# blob: 2e215659941ae039b72c2b678fdea38c913ad0cd [file] [log] [blame]
package KorAP::Index::CoreNLP::Constituency;
use KorAP::Index::Base;
use Set::Scalar;
# Index CoreNLP constituency spans as '<>:corenlp/c:TYPE' span terms.
#
# Works in two passes:
#   1. Collect every constituency span by id and record every id that is
#      the target of a 'dominates' relation (such spans cannot be roots).
#   2. Starting from each root (a collected span that nothing dominates),
#      follow the 'dominates' relations depth-first and add one term per
#      span, with the tree depth stored in the term payload.
#
# $self is a reference to the object that provides add_spandata() and
# stream().
#
# Returns 1 on success, or a false value if add_spandata() returned false.
sub parse {
  my $self = shift;

  # Every span seen in the first pass, keyed by id. Entries are deleted as
  # soon as they are indexed, so no span is ever added twice.
  my %corenlp_const;
  my $corenlp_const_root   = Set::Scalar->new;
  my $corenlp_const_noroot = Set::Scalar->new;

  # First run: collect all spans and tell root candidates from dominated ones
  $$self->add_spandata(
    foundry => 'corenlp',
    layer   => 'constituency',
    cb      => sub {
      my ($stream, $span) = @_;

      $corenlp_const{$span->id} = $span;

      # Maybe root
      $corenlp_const_root->insert($span->id);

      my $rel = $span->hash->{rel} or return;

      # Make rel an array in case it's not
      $rel = [$rel] unless ref $rel eq 'ARRAY';

      foreach my $edge (@$rel) {

        # A relation without a label is never a 'dominates' edge
        next unless ($edge->{-label} // '') eq 'dominates';

        if ($edge->{-target}) {
          $corenlp_const_noroot->insert($edge->{-target});
        }
        elsif (my $uri = $edge->{-uri}) {
          # Reduce the URI to the bare id by dropping the file prefix
          $uri =~ s/^morpho\.xml#//;
          $corenlp_const_noroot->insert($uri);
        };
      };
    }
  ) or return;

  my $stream = $$self->stream;

  # Recursive worker: index $span at depth $level, then its children
  my $add_const;
  $add_const = sub {
    my ($span, $level) = @_;

    my $mtt     = $stream->pos($span->p_start);
    my $content = $span->hash;
    my $f       = $content->{fs}->{f};

    # Only feature structures named 'const' describe a constituent
    return unless ($f->{-name} // '') eq 'const';

    my $type = $f->{'#text'} or return;

    # $type is now NPA, NP, NUM ...
    my %term = (
      term    => '<>:corenlp/c:' . $type,
      o_start => $span->o_start,
      o_end   => $span->o_end,
      p_end   => $span->p_end,
      pti     => 64,

      # The depth in the tree travels in the payload
      payload => '<b>' . ($level // 0)
    );
    $mtt->add(%term);

    my $rel = $content->{rel} or return;
    $rel = [$rel] unless ref $rel eq 'ARRAY';

    foreach my $edge (@$rel) {
      next unless ($edge->{-label} // '') eq 'dominates';
      my $target = $edge->{-target} or next;

      # A child that is unknown or was already indexed must not end the
      # loop: skip it and keep descending into the remaining children.
      my $subspan = delete $corenlp_const{$target} or next;
      $add_const->($subspan, $level + 1);
    };

    return;
  };

  # Next run: roots are all collected spans that nothing dominates
  my $diff = $corenlp_const_root->difference($corenlp_const_noroot);

  # Iterate over all roots
  foreach my $root_id ($diff->members) {

    # Get root span based on root id
    my $obj = delete $corenlp_const{$root_id} or next;

    # Start on level 0
    $add_const->($obj, 0);
  };

  # The closure refers to itself through $add_const. Clear the variable so
  # this reference cycle is broken and everything the closure captured
  # can be freed.
  undef $add_const;

  return 1;
}
# Returns a new array reference holding the single layer descriptor
# string 'corenlp/c=spans'. Any arguments are ignored.
sub layer_info {
  my @layers = ('corenlp/c=spans');
  return \@layers;
}
1;