| Akron | c29b8e1 | 2019-12-16 14:28:09 +0100 | [diff] [blame] | 1 | package KorAP::XML::Annotation::DGD::Structure; |
| 2 | use KorAP::XML::Annotation::Base; |
| 3 | use List::Util qw/uniq/; |
| 4 | |
| 5 | # This handler introduces pseudo sentences |
| 6 | # based on anchor texts in AGD. A sentence is defined as |
| 7 | # being the span between |
| 8 | # a) two empty anchor elements, or |
| 9 | # b) an anchor element and the start of the doc, or |
| 10 | # c) an anchor element and the end of the doc. |
| 11 | |
# Introduce pseudo sentences ('<>:base/s:s' spans) delimited by anchors.
#
# Walks all spans of the struct/structure layer via add_spandata(),
# records the start position (token position and character offset) of
# every span labelled 'anchor' as a milestone, and then adds one
# '<>:base/s:s' span to the stream for each stretch between two
# consecutive distinct milestones, beginning at position 0 / offset 0.
# The number of added spans is stored as 'base/sentences' meta data.
#
# Returns 1 on success, or nothing if add_spandata() returns false.
sub parse {
  my $self = shift;

  my @milestones = ();
  my ($p_start, $o_start) = (0, 0);
  my ($last_p, $last_o)   = (0, 0);

  $$self->add_spandata(
    foundry => 'struct',
    layer   => 'structure',
    cb      => sub {
      my ($stream, $span) = @_;

      # Read feature. When it is a list, the first entry carries the
      # label; further entries are not needed here and are left untouched
      # (so nothing is autovivified inside the span's own data).
      my $feature = $span->get_hash->{fs}->{f};
      $feature = $feature->[0] if ref $feature eq 'ARRAY';

      # Get term label - may be missing if the feature is not a hash
      my $name = ref $feature eq 'HASH' ? $feature->{'#text'} : undef;

      # Check only for anchors
      if (defined $name && $name eq 'anchor') {
        push @milestones, [ $span->get_p_start, $span->get_o_start ];
      }
      else {
        # Remember the most recently seen non-anchor span; it provides
        # the final milestone that closes the last sentence.
        # NOTE(review): the token position is taken from the span START
        # while the offset is taken from the span END - confirm that
        # this asymmetry is intended.
        $last_p = $span->get_p_start;
        $last_o = $span->get_o_end;
      }
    }
  ) or return;

  my $sentences = 0;

  # Add final position
  push @milestones, [$last_p, $last_o];

  # Sort milestones by token position. Duplicates are not removed
  # here; they are skipped by the check at the top of the loop below.
  @milestones = sort { $a->[0] <=> $b->[0] } @milestones;

  my $stream = $$self->stream;

  # Iterate over milestones
  foreach my $milestone (@milestones) {
    my ($p_end, $o_end) = @$milestone;

    # Skip milestones that do not advance both the token position
    # and the character offset (this also drops duplicates)
    next if $p_end == $p_start || $o_end == $o_start;

    my $mtt = $stream->pos($p_start);

    # Add the base sentence
    my $mt = $mtt->add_by_term('<>:base/s:s');
    $mt->set_o_start($o_start);
    $mt->set_o_end($o_end);
    $mt->set_p_end($p_end);
    $mt->set_pti(64);
    $mt->set_payload('<b>1');
    $sentences++;

    # The next sentence starts where this one ended
    ($p_start, $o_start) = ($p_end, $o_end);
  }

  # Set meta information about sentence count
  $stream->add_meta('base/sentences', '<i>' . $sentences);

  return 1;
};
| 88 | |
# This handler provides no layer information:
# always returns a fresh, empty array reference.
sub layer_info {
  return [];
};
| 92 | |
| 93 | |
| 94 | 1; |
| Akron | 1cdbc9d | 2020-05-07 15:28:54 +0200 | [diff] [blame] | 95 | |
| Akron | c29b8e1 | 2019-12-16 14:28:09 +0100 | [diff] [blame] | 96 | __END__ |