blob: a863c43bf47cd77af85e0f6f6e020d58f08fccab [file] [log] [blame]
Akronc29b8e12019-12-16 14:28:09 +01001package KorAP::XML::Annotation::DGD::Structure;
2use KorAP::XML::Annotation::Base;
3use List::Util qw/uniq/;
4
5# This handler introduces pseudo sentences
6# based on anchor texts in AGD. A sentence is defined as
7# being the span between
8# a) two empty anchor elements, or
9# b) an anchor element and the start of the doc, or
10# c) an anchor element and the end of the doc.
11
# Introduce pseudo sentences based on anchor milestones.
#
# Every span of the 'struct' foundry / 'structure' layer whose feature
# label is 'anchor' contributes a milestone (token position, character
# offset). A '<>:base/s:s' span is then added to the stream between each
# pair of consecutive milestones, starting at position 0 / offset 0 and
# closing with the position data remembered from the last non-anchor span.
# The number of added sentences is stored as 'base/sentences' meta
# information on the stream.
#
# Returns 1 on success, or an empty return when add_spandata() yields
# a false value.
sub parse {
  my $self = shift;

  my @milestones;
  my ($last_p, $last_o) = (0, 0);

  $$self->add_spandata(
    foundry => 'struct',
    layer   => 'structure',
    cb      => sub {
      # The first callback argument (the stream) is not needed here
      my (undef, $span) = @_;

      # Read feature
      my $feature = $span->get_hash->{fs}->{f};

      # With a list of features, the first entry carries the label.
      # Further entries (attributes) are irrelevant for sentence detection
      # and are deliberately not touched, so the span data is not
      # autovivified as a side effect.
      $feature = $feature->[0] if ref $feature eq 'ARRAY';

      # Get term label (may be missing)
      my $name = ref $feature eq 'HASH' ? $feature->{'#text'} : undef;

      # Only anchors are milestones
      if (defined $name && $name eq 'anchor') {
        push @milestones, [ $span->get_p_start, $span->get_o_start ];
      }
      else {
        # Remember the last non-anchor span to close the final sentence.
        # NOTE(review): the token position is taken from the span start
        # while the offset is taken from the span end - confirm that this
        # pairing is intended.
        $last_p = $span->get_p_start;
        $last_o = $span->get_o_end;
      }
    }
  ) or return;

  # Add final position
  push @milestones, [ $last_p, $last_o ];

  # Sort milestones by token position; duplicates are skipped below
  @milestones = sort { $a->[0] <=> $b->[0] } @milestones;

  my $stream    = $$self->stream;
  my $sentences = 0;
  my ($p_start, $o_start) = (0, 0);

  # Iterate over milestones
  foreach my $milestone (@milestones) {
    my ($p_end, $o_end) = @$milestone;

    # A milestone sharing the token position or the offset with the
    # current sentence start (e.g. a duplicate) opens no new sentence
    next if $p_end == $p_start || $o_end == $o_start;

    # Add the base sentence
    my $mt = $stream->pos($p_start)->add_by_term('<>:base/s:s');
    $mt->set_o_start($o_start);
    $mt->set_o_end($o_end);
    $mt->set_p_end($p_end);
    $mt->set_pti(64);
    $mt->set_payload('<b>1');
    $sentences++;

    # The next sentence starts where this one ended
    ($p_start, $o_start) = ($p_end, $o_end);
  }

  # Set meta information about sentence count
  $stream->add_meta('base/sentences', '<i>' . $sentences);

  return 1;
};
88
# Layer information of this handler: always a fresh, empty array reference.
sub layer_info {
  return [];
}
92
93
941;
Akron1cdbc9d2020-05-07 15:28:54 +020095
Akronc29b8e12019-12-16 14:28:09 +010096__END__