blob: 6961b458068ba38ae84352cdac9ced2599412e1e [file] [log] [blame]
Akronc29b8e12019-12-16 14:28:09 +01001package KorAP::XML::Annotation::DGD::Structure;
2use KorAP::XML::Annotation::Base;
3use List::Util qw/uniq/;
4
5# This handler introduces pseudo sentences
6# based on anchor texts in AGD. A sentence is defined as
7# being the span between
8# a) two empty anchor elements, or
9# b) an anchor element and the start of the doc, or
10# c) an anchor element and the end of the doc.
11
# Derive pseudo sentences from anchor milestones.
#
# Walks all spans of the 'struct' foundry / 'structure' layer. Every
# span labelled 'anchor' contributes a milestone made of its token
# position (p_start) and character offset (o_start). The position and
# end offset of the last non-anchor span visited are appended as the
# closing milestone. Starting at position 0 / offset 0, a
# '<>:base/s:s' span is added between each pair of consecutive
# milestones, and the number of spans added is stored in the stream
# meta data under 'base/sentences'.
#
# Returns 1 on success. Returns nothing if add_spandata() returns a
# false value.
sub parse {
  my $self = shift;

  my @milestones;
  my ($last_p, $last_o) = (0, 0);

  $$self->add_spandata(
    foundry => 'struct',
    layer   => 'structure',
    cb      => sub {
      my ($stream, $span) = @_;

      # Read feature
      my $feature = $span->hash->{fs}->{f};

      # In a feature list the label is the first entry. Further
      # entries are not needed here and are deliberately not
      # dereferenced, so nothing gets autovivified into the span data.
      $feature = $feature->[0] if ref($feature) eq 'ARRAY';

      # Get term label (may be undefined for unlabelled spans)
      my $name = $feature->{'#text'};

      if (defined $name && $name eq 'anchor') {
        # Anchors mark sentence boundaries
        push @milestones, [ $span->p_start, $span->o_start ];
      } else {
        # Remember the last non-anchor span as the end of the data
        $last_p = $span->p_start;
        $last_o = $span->o_end;
      }
    }
  ) or return;

  # Add final position
  push @milestones, [ $last_p, $last_o ];

  # Order milestones by token position. Duplicates are not removed
  # here; they are skipped in the loop below.
  @milestones = sort { $a->[0] <=> $b->[0] } @milestones;

  my $stream = $$self->stream;

  my ($p_start, $o_start) = (0, 0);
  my $sentences = 0;

  # Iterate over milestones
  foreach my $milestone (@milestones) {
    my ($p_pos, $o_pos) = @$milestone;

    # Skip milestones that share the token position or the character
    # offset with the current sentence start
    next if $p_pos == $p_start || $o_pos == $o_start;

    my $token = $stream->pos($p_start);

    # Add the base sentence
    my $sentence = $token->add(
      term    => '<>:base/s:s',
      o_start => $o_start,
      o_end   => $o_pos,
      p_start => $p_start,
      p_end   => $p_pos + 1,
      pti     => 64
    );
    $sentence->payload('<b>1');
    $sentences++;

    # The next sentence starts at this milestone
    ($p_start, $o_start) = ($p_pos, $o_pos);
  }

  # Set meta information about sentence count
  $stream->add_meta('base/sentences', '<i>' . $sentences);

  return 1;
}
91
# This handler reports no layer information: the result is always a
# fresh, empty array reference.
sub layer_info {
  return [];
}
95
96
971;
98__END__