| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 1 | package KorAP::XML::Tokenizer::Tokens; |
| 2 | use Mojo::Base 'KorAP::XML::Tokenizer::Units'; | ||||
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 3 | use Mojo::ByteStream 'b'; |
| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 4 | use KorAP::XML::Tokenizer::Token; |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 5 | use Carp qw/croak carp/; |
| 6 | use XML::Fast; | ||||
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 7 | use Try::Tiny; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 8 | |
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 9 | has 'log' => sub { |
| 10 | Log::Log4perl->get_logger(__PACKAGE__) | ||||
| 11 | }; | ||||
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 12 | |
| 13 | sub parse { | ||||
| 14 | my $self = shift; | ||||
| Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 15 | |
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 16 | my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml'; |
| Nils Diewald | 38b3b5a | 2013-12-04 00:54:08 +0000 | [diff] [blame] | 17 | |
| Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 18 | # Legacy data support |
| 19 | unless (-e $path) { | ||||
| 20 | if ($self->layer eq 'namedentities') { | ||||
| 21 | $path = $self->path . $self->foundry . '/ne_combined.xml'; | ||||
| 22 | return unless -e $path; | ||||
| 23 | } | ||||
| 24 | elsif ($self->layer eq 'morpho' && $self->foundry eq 'glemm') { | ||||
| 25 | $path = $self->path . $self->foundry . '/glemm.xml'; | ||||
| 26 | return unless -e $path; | ||||
| 27 | } | ||||
| 28 | else { | ||||
| 29 | return; | ||||
| 30 | }; | ||||
| 31 | }; | ||||
| Nils Diewald | 38b3b5a | 2013-12-04 00:54:08 +0000 | [diff] [blame] | 32 | |
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 33 | my $file = b($path)->slurp; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 34 | |
| Nils Diewald | a96de62 | 2014-10-31 17:29:23 +0000 | [diff] [blame] | 35 | # Bug workaround |
| 36 | if ($self->foundry eq 'glemm') { | ||||
| 37 | if (index($file, "</span\n") > 0) { | ||||
| 38 | $file =~ s!</span$!</span>!gm | ||||
| 39 | }; | ||||
| 40 | }; | ||||
| 41 | |||||
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 42 | # my $spans = Mojo::DOM->new($file); |
| 43 | # $spans->xml(1); | ||||
| Nils Diewald | ded8e83 | 2013-11-06 15:42:17 +0000 | [diff] [blame] | 44 | |
| Nils Diewald | aba4710 | 2013-11-27 15:02:47 +0000 | [diff] [blame] | 45 | my ($spans, $error); |
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 46 | try { |
| 47 | local $SIG{__WARN__} = sub { | ||||
| Nils Diewald | aba4710 | 2013-11-27 15:02:47 +0000 | [diff] [blame] | 48 | $error = 1; |
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 49 | }; |
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 50 | $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList}; |
| 51 | } | ||||
| 52 | catch { | ||||
| Nils Diewald | aba4710 | 2013-11-27 15:02:47 +0000 | [diff] [blame] | 53 | $self->log->warn('Span error in ' . $path . ($_ ? ': ' . $_ : '')); |
| 54 | $error = 1; | ||||
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 55 | }; |
| 56 | |||||
| Nils Diewald | aba4710 | 2013-11-27 15:02:47 +0000 | [diff] [blame] | 57 | return if $error; |
| 58 | |||||
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 59 | if (ref $spans && $spans->{span}) { |
| 60 | $spans = $spans->{span}; | ||||
| 61 | } | ||||
| 62 | else { | ||||
| 63 | return []; | ||||
| 64 | }; | ||||
| 65 | |||||
| 66 | $spans = [$spans] if ref $spans ne 'ARRAY'; | ||||
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 67 | |
| 68 | my ($should, $have) = (0,0); | ||||
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 69 | |
| 70 | my @tokens; | ||||
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 71 | |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 72 | foreach my $s (@$spans) { |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 73 | |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 74 | $should++; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 75 | |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 76 | my $token = $self->token( |
| 77 | $s->{-from}, | ||||
| 78 | $s->{-to}, | ||||
| 79 | $s | ||||
| 80 | ) or next; | ||||
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 81 | |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 82 | $have++; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 83 | |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 84 | push(@tokens, $token); |
| 85 | }; | ||||
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 86 | |
| 87 | $self->should($should); | ||||
| 88 | $self->have($have); | ||||
| 89 | |||||
| 90 | return \@tokens; | ||||
| 91 | }; | ||||
| 92 | |||||
| 93 | |||||
| 94 | 1; | ||||