blob: 50d62fa8e10f486156659c89297d9a5c67ced5be [file] [log] [blame]
Akrone4c2e412016-01-28 15:10:50 +01001package KorAP::XML::Tokenizer::Tokens;
2use Mojo::Base 'KorAP::XML::Tokenizer::Units';
Nils Diewald2db9ad02013-10-29 19:26:43 +00003use Mojo::ByteStream 'b';
Akrone4c2e412016-01-28 15:10:50 +01004use KorAP::XML::Tokenizer::Token;
Nils Diewald7364d1f2013-11-05 19:26:35 +00005use Carp qw/croak carp/;
6use XML::Fast;
Nils Diewald092178e2013-11-26 16:18:48 +00007use Try::Tiny;
Nils Diewald2db9ad02013-10-29 19:26:43 +00008
# Lazily built logger attribute: the Log::Log4perl logger
# registered under this package's name.
# NOTE(review): Log::Log4perl is not loaded in the visible part of
# this file - presumably it is loaded by the parent class or the
# application; confirm.
has log => sub {
  return Log::Log4perl->get_logger(__PACKAGE__);
};
Nils Diewald2db9ad02013-10-29 19:26:43 +000012
# Parse the annotation file of the current foundry and layer and
# turn every <span> of its span list into a token.
#
# The file is looked up at <path><foundry>/<layer>.xml, with two
# legacy fallbacks (see below).
#
# Returns
#   - nothing (bare return), if no matching file exists or the XML
#     could not be parsed cleanly (the parser died or warned),
#   - an empty array reference, if the span list holds no spans,
#   - otherwise a reference to an array of all true values returned
#     by $self->token(from, to, span).
#
# On success the 'should' (spans seen) and 'have' (tokens created)
# counters of the object are updated.
sub parse {
  my $self = shift;

  my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml';

  # Legacy data support: older corpora use different file names
  unless (-e $path) {
    if ($self->layer eq 'namedentities') {
      $path = $self->path . $self->foundry . '/ne_combined.xml';
    }
    elsif ($self->layer eq 'morpho' && $self->foundry eq 'glemm') {
      $path = $self->path . $self->foundry . '/glemm.xml';
    }
    else {
      return;
    };

    return unless -e $path;
  };

  my $file = b($path)->slurp;

  # Bug workaround: some glemm files contain closing span tags
  # without the final '>'. The cheap index() check avoids running
  # the substitution on files that are not affected.
  if ($self->foundry eq 'glemm' && index($file, "</span\n") != -1) {
    $file =~ s!</span$!</span>!gm;
  };

  my ($spans, $error, @warnings);
  try {

    # Any warning raised while parsing marks the file as broken.
    # The message is kept, so the reason can be logged afterwards
    # instead of being dropped silently.
    local $SIG{__WARN__} = sub {
      push @warnings, join('', @_);
      $error = 1;
    };

    $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList};
  }
  catch {
    $self->log->warn('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
    $error = 1;
  };

  # Report collected parser warnings outside of the signal handler
  foreach my $warning (@warnings) {
    chomp $warning;
    $self->log->warn('Span warning in ' . $path . ': ' . $warning);
  };

  return if $error;

  # No spans in the span list
  return [] unless ref $spans && $spans->{span};

  $spans = $spans->{span};

  # A single span is not wrapped in an array by the parser
  $spans = [$spans] if ref $spans ne 'ARRAY';

  my ($should, $have) = (0, 0);
  my @tokens;

  foreach my $span (@$spans) {
    $should++;

    # Skip spans for which no token is returned
    my $token = $self->token(
      $span->{-from},
      $span->{-to},
      $span
    ) or next;

    $have++;
    push(@tokens, $token);
  };

  $self->should($should);
  $self->have($have);

  return \@tokens;
};
92
93
941;