| Akron | c29b8e1 | 2019-12-16 14:28:09 +0100 | [diff] [blame^] | 1 | package KorAP::XML::Annotation::DGD::Structure; |
| 2 | use KorAP::XML::Annotation::Base; |
| 3 | use List::Util qw/uniq/; |
| 4 | |
| 5 | # This handler introduces pseudo sentences |
| 6 | # based on anchor texts in AGD. A sentence is defined as |
| 7 | # being the span between |
| 8 | # a) two empty anchor elements, or |
| 9 | # b) an anchor element and the start of the doc, or |
| 10 | # c) an anchor element and the end of the doc. |
| 11 | |
# Create pseudo sentence spans ('<>:base/s:s') between anchor milestones.
#
# A callback is registered via add_spandata() for the 'struct' foundry and
# the 'structure' layer. Every span labelled 'anchor' contributes a
# milestone (token position, character offset). The most recently seen
# non-anchor span supplies the closing milestone. One sentence span is then
# added to the stream for each pair of consecutive, distinct milestones,
# starting at position 0 / offset 0, and the number of sentences is stored
# in the stream meta field 'base/sentences'.
#
# Returns 1 on success. Returns early (empty list / undef) if
# add_spandata() yields a false value.
sub parse {
  my $self = shift;

  # Each milestone is a pair: [ token position, character offset ]
  my @milestones = ();
  my ($p_start, $o_start) = (0, 0);
  my ($last_p, $last_o) = (0, 0);

  $$self->add_spandata(
    foundry => 'struct',
    layer => 'structure',
    cb => sub {
      my (undef, $span) = @_;

      # Read feature
      my $feature = $span->hash->{fs}->{f};

      # In a feature list the first entry carries the term label;
      # the remaining entries are not needed here
      if (ref($feature) eq 'ARRAY') {
        $feature = $feature->[0];
      };

      # Get term label (may be missing)
      my $name = $feature->{'#text'};

      # Only anchors are milestones; guard against an undefined label
      # so unlabelled features do not trigger "uninitialized" warnings
      if (defined $name && $name eq 'anchor') {
        push @milestones, [ $span->p_start, $span->o_start ];
      } else {
        # Remember the latest non-anchor span as the closing position
        $last_p = $span->p_start;
        $last_o = $span->o_end;
      }
    }
  ) or return;

  my $sentences = 0;

  # Add final position
  push @milestones, [$last_p, $last_o];

  # Sort milestones by token position
  # (duplicates are skipped in the loop below)
  @milestones = sort {
    $a->[0] <=> $b->[0]
  } @milestones;

  my $stream = $$self->stream;

  # Iterate over milestones. A named loop variable is used instead of $_
  # because the stream methods called in the body may touch $_
  foreach my $milestone (@milestones) {
    my ($p_end, $o_end) = @$milestone;

    # Skip milestones that would not advance position or offset
    if (($p_end == $p_start) || ($o_end == $o_start)) {
      next;
    };

    my $mtt = $stream->pos($p_start);

    # Add the base sentence
    my $mt = $mtt->add(
      term => '<>:base/s:s',
      o_start => $o_start,
      o_end => $o_end,
      p_start => $p_start,
      p_end => $p_end + 1,
      pti => 64
    );
    $mt->payload('<b>1');
    $sentences++;

    # The next sentence starts where this one ended
    $p_start = $p_end;
    $o_start = $o_end;
  }

  # Set meta information about sentence count
  $stream->add_meta('base/sentences', '<i>' . $sentences);

  return 1;
};
| 91 | |
# Layer information of this handler: always an empty array reference.
sub layer_info {
  return [];
};
| 95 | |
| 96 | |
| 97 | 1; |
| 98 | __END__ |