| Akron | 151676d | 2016-03-14 20:12:14 +0100 | [diff] [blame] | 1 | package KorAP::XML::Annotation::XIP::Constituency; |
| 2 | use KorAP::XML::Annotation::Base; |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 3 | use Set::Scalar; |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 4 | use Scalar::Util qw/weaken/; |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 5 | |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 6 | our $URI_RE = qr/^[^\#]+\#(.+?)$/; |
| 7 | |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 8 | sub parse { |
| 9 | my $self = shift; |
| 10 | |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 11 | # Collect all spans |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 12 | my %xip_const; |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 13 | |
| 14 | # Collect all roots |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 15 | my $xip_const_root = Set::Scalar->new; |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 16 | |
| 17 | # Collect all non-roots |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 18 | my $xip_const_noroot = Set::Scalar->new; |
| 19 | |
| 20 | # First run: |
| 21 | $$self->add_spandata( |
| 22 | foundry => 'xip', |
| 23 | layer => 'constituency', |
| 24 | encoding => 'xip', |
| 25 | cb => sub { |
| 26 | my ($stream, $span) = @_; |
| 27 | |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 28 | # Collect the span |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 29 | $xip_const{$span->id} = $span; |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 30 | |
| 31 | # It's probably a root |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 32 | $xip_const_root->insert($span->id); |
| 33 | |
| 34 | my $rel = $span->hash->{rel} or return; |
| 35 | $rel = [$rel] unless ref $rel eq 'ARRAY'; |
| 36 | |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 37 | # Iterate over all relations |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 38 | foreach (@$rel) { |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 39 | if ($_->{-label} eq 'dominates') { |
| 40 | |
| 41 | my $target = $_->{-target}; |
| 42 | if (!$target && $_->{-uri} && |
| 43 | $_->{-uri} =~ $URI_RE) { |
| 44 | $target = $1; |
| 45 | }; |
| 46 | |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 47 | # The target may not be addressable |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 48 | next unless $target; |
| 49 | |
| 50 | # It's definately not a root |
| 51 | $xip_const_noroot->insert($target); |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 52 | }; |
| 53 | }; |
| 54 | } |
| 55 | ) or return; |
| 56 | |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 57 | # Get the stream |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 58 | my $stream = $$self->stream; |
| 59 | |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 60 | # Recursive tree traversal method |
| Nils Diewald | 840c924 | 2014-10-28 19:51:26 +0000 | [diff] [blame] | 61 | my $add_const; |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 62 | $add_const = sub { |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 63 | my ($span, $level) = @_; |
| 64 | |
| 65 | weaken $xip_const_root; |
| 66 | weaken $xip_const_noroot; |
| 67 | |
| 68 | # Get the correct position for the span |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 69 | my $mtt = $stream->pos($span->p_start); |
| 70 | |
| 71 | my $content = $span->hash; |
| 72 | my $f = $content->{fs}->{f}; |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 73 | |
| Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 74 | unless ($f->{-name} eq 'const') { |
| 75 | warn $f->{-id} . ' is no constant'; |
| 76 | return; |
| 77 | }; |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 78 | |
| Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 79 | my $type = $f->{'#text'}; |
| 80 | |
| 81 | unless ($type) { |
| 82 | warn $f->{-id} . ' has no content'; |
| 83 | return; |
| 84 | }; |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 85 | |
| 86 | # $type is now NPA, NP, NUM ... |
| 87 | my %term = ( |
| Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 88 | term => '<>:xip/c:' . $type, |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 89 | o_start => $span->o_start, |
| 90 | o_end => $span->o_end, |
| Akron | 5f511d2 | 2016-01-05 20:54:34 +0100 | [diff] [blame] | 91 | p_end => $span->p_end, |
| 92 | pti => 64 |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 93 | ); |
| 94 | |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 95 | # Only add level payload if node != root |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 96 | $term{payload} ='<b>' . ($level // 0); |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 97 | |
| 98 | $mtt->add(%term); |
| 99 | |
| Nils Diewald | 840c924 | 2014-10-28 19:51:26 +0000 | [diff] [blame] | 100 | # my $this = __SUB__ |
| 101 | my $this = $add_const; |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 102 | |
| Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 103 | my $rel = $content->{rel}; |
| 104 | |
| 105 | unless ($rel) { |
| Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 106 | warn $f->{-id} . ' has no relation' if $f->{-id}; |
| Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 107 | return; |
| 108 | }; |
| 109 | |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 110 | $rel = [$rel] unless ref $rel eq 'ARRAY'; |
| 111 | |
| 112 | foreach (@$rel) { |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 113 | next if $_->{-label} ne 'dominates'; |
| 114 | my $target; |
| 115 | |
| 116 | $target = $_->{-target}; |
| 117 | if (!$target && $_->{-uri} && $_->{-uri} =~ $URI_RE) { |
| 118 | $target = $1; |
| 119 | }; |
| 120 | |
| 121 | next unless $target; |
| 122 | |
| 123 | my $subspan = delete $xip_const{$target}; |
| Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 124 | return unless $subspan; |
| 125 | # warn "Span " . $target . " not found"; |
| 126 | |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 127 | $this->($subspan, $level + 1); |
| 128 | }; |
| 129 | }; |
| 130 | |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 131 | # Calculate all roots |
| 132 | my $roots = $xip_const_root->difference($xip_const_noroot); |
| 133 | |
| 134 | # Start tree traversal from the root |
| 135 | foreach ($roots->members) { |
| Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 136 | |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 137 | my $obj = delete $xip_const{$_} or next; |
| Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 138 | |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 139 | $add_const->($obj, 0); |
| 140 | }; |
| 141 | |
| 142 | return 1; |
| 143 | }; |
| 144 | |
| Nils Diewald | 47c3ef3 | 2014-04-30 19:13:17 +0000 | [diff] [blame] | 145 | |
| 146 | # Layer info |
| Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 147 | sub layer_info { |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 148 | ['xip/c=spans'] |
| 149 | }; |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 150 | |
| 151 | 1; |