Found some bugs in XIP/Constituency ... and introduced some new ones - yay
diff --git a/lib/KorAP/Index/XIP/Constituency.pm b/lib/KorAP/Index/XIP/Constituency.pm
index 341a9da..ce43a7a 100644
--- a/lib/KorAP/Index/XIP/Constituency.pm
+++ b/lib/KorAP/Index/XIP/Constituency.pm
@@ -1,14 +1,21 @@
package KorAP::Index::XIP::Constituency;
use KorAP::Index::Base;
use Set::Scalar;
+use Scalar::Util qw/weaken/;
use v5.16;
+our $URI_RE = qr/^[^\#]+\#(.+?)$/;
+
sub parse {
my $self = shift;
- # Collect all spans and check for roots
+ # Collect all spans
my %xip_const;
+
+ # Collect all roots
my $xip_const_root = Set::Scalar->new;
+
+ # Collect all non-roots
my $xip_const_noroot = Set::Scalar->new;
# First run:
@@ -19,25 +26,44 @@
cb => sub {
my ($stream, $span) = @_;
+ # Collect the span
$xip_const{$span->id} = $span;
+
+ # It's probably a root
$xip_const_root->insert($span->id);
my $rel = $span->hash->{rel} or return;
$rel = [$rel] unless ref $rel eq 'ARRAY';
foreach (@$rel) {
- if ($_->{-label} eq 'dominates' && $_->{-target}) {
- $xip_const_noroot->insert($_->{-target});
+ if ($_->{-label} eq 'dominates') {
+
+ my $target = $_->{-target};
+ if (!$target && $_->{-uri} &&
+ $_->{-uri} =~ $URI_RE) {
+ $target = $1;
+ };
+
+ next unless $target;
+
+ # It's definitely not a root
+ $xip_const_noroot->insert($target);
};
};
}
) or return;
+ # Get the stream
my $stream = $$self->stream;
+ # Recursive tree traversal method
my $add_const = sub {
- my $span = shift;
- my $level = shift;
+ my ($span, $level) = @_;
+
+ weaken $xip_const_root;
+ weaken $xip_const_noroot;
+
+ # Get the correct position for the span
my $mtt = $stream->pos($span->p_start);
my $content = $span->hash;
@@ -54,6 +80,7 @@
p_end => $span->p_end
);
+ # Only add level payload if node != root
$term{payload} = '<b>' . $level if $level;
$mtt->add(%term);
@@ -64,14 +91,30 @@
$rel = [$rel] unless ref $rel eq 'ARRAY';
foreach (@$rel) {
- next if $_->{-label} ne 'dominates' || !$_->{-target};
- my $subspan = delete $xip_const{$_->{-target}} or return;
+ next if $_->{-label} ne 'dominates';
+ my $target;
+
+ $target = $_->{-target};
+ if (!$target && $_->{-uri} && $_->{-uri} =~ $URI_RE) {
+ $target = $1;
+ };
+
+ next unless $target;
+
+ my $subspan = delete $xip_const{$target};
+ unless ($subspan) {
+ warn "Span " . $target . " not found";
+ return;
+ };
$this->($subspan, $level + 1);
};
};
- my $diff = $xip_const_root->difference($xip_const_noroot);
- foreach ($diff->members) {
+ # Calculate all roots
+ my $roots = $xip_const_root->difference($xip_const_noroot);
+
+ # Start tree traversal from the root
+ foreach ($roots->members) {
my $obj = delete $xip_const{$_} or next;
$add_const->($obj, 0);
};
@@ -79,6 +122,8 @@
return 1;
};
+
+# Layer info
sub layer_info {
['xip/c=const']
}
diff --git a/lib/KorAP/Index/XIP/Morpho.pm b/lib/KorAP/Index/XIP/Morpho.pm
index a6d0323..1eae6c6 100644
--- a/lib/KorAP/Index/XIP/Morpho.pm
+++ b/lib/KorAP/Index/XIP/Morpho.pm
@@ -19,7 +19,8 @@
my $capital = 0;
foreach (@$content) {
# pos
- if (($_->{-name} eq 'pos') && ($found = $_->{'#text'})) {
+ if (($_->{-name} eq 'pos') &&
+ ($found = $_->{'#text'})) {
$mtt->add(
term => 'xip/p:' . $found
);
@@ -30,7 +31,8 @@
foreach (@$content) {
# lemma
- if (($_->{-name} eq 'lemma') && ($found = $_->{'#text'})) {
+ if (($_->{-name} eq 'lemma') &&
+ ($found = $_->{'#text'})) {
# Verb delimiter (aus=druecken)
$mtt->add(term => 'xip/l:' . $found);
@@ -42,9 +44,6 @@
my (@token) = split('#', $found);
if (@token == 1) {
-# my $x = $token[0];
-# $x =~ s{/\w+$}{};
-# $mtt->add(term => 'xip/l:' . $x);
next;
};
my $full = '';
@@ -53,12 +52,6 @@
$_ =~ s{/\w+$}{};
$mtt->add(term => 'xip/l:#' . $_);
};
-# if (@token > 1) {
-# $full =~ s{/}{}g;
-# $full = lc $full;
-# $full = $capital ? ucfirst($full) : $full;
-# $mtt->add(term => 'xip/l:' . $full);
-# };
};
};
}) or return;
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 5ac0e64..343a85c 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -175,8 +175,6 @@
%param
);
- use Data::Dumper;
-
my $spanarray = $spans->parse or return;
if ($spans->should == $spans->have) {
diff --git a/lib/KorAP/Tokenizer/Range.pm b/lib/KorAP/Tokenizer/Range.pm
index a477820..5fa2a38 100644
--- a/lib/KorAP/Tokenizer/Range.pm
+++ b/lib/KorAP/Tokenizer/Range.pm
@@ -49,9 +49,17 @@
return;
};
- if ($found =~ /!(\d+):(\d+)$/) {
+ if ($found =~ /^!(\d+):(\d+)$/) {
return $2;
};
};
+
+sub to_string {
+ my $self = shift;
+ return join('', map {'['.join(',',@$_).']'}
+ @{$$self->get_range(0,100,'...')})
+ . '...';
+};
+
1;
diff --git a/lib/KorAP/Tokenizer/Units.pm b/lib/KorAP/Tokenizer/Units.pm
index 5deead0..a073fef 100644
--- a/lib/KorAP/Tokenizer/Units.pm
+++ b/lib/KorAP/Tokenizer/Units.pm
@@ -24,7 +24,6 @@
$span->o_end($to);
my $start = $self->match->startswith($span->o_start);
-
unless (defined $start) {
$start = $self->range->after($span->o_start) or return;
};
@@ -32,9 +31,11 @@
$span->p_start($start);
my $end = $self->match->endswith($span->o_end);
- unless ($end) {
+
+ unless (defined $end) {
$end = $self->range->before($span->o_end) or return;
};
+
# $span->p_end($end);
# return unless $span->p_end >= $span->p_start;
@@ -57,11 +58,6 @@
return unless defined $pos;
-# if ($from == $to) {
-# print "Unable to find match for $from - $to (resp ".$s->{-from} . '-' . $s->{-to}.") " . $s->{-id};
-# print "\n";
-# };
-
my $token = KorAP::Tokenizer::Token->new;
$token->id($s->{-id}) if $s && $s->{-id};
$token->pos($pos);
diff --git a/t/artificial.t b/t/artificial.t
index 2addd6b..e4e5282 100644
--- a/t/artificial.t
+++ b/t/artificial.t
@@ -277,7 +277,7 @@
ok($tokens = new_tokenizer->parse, 'New Tokenizer');
# Add XIP/Sentences
-ok($tokens->add('XIP', 'Dependency'), 'Add XIP/Sentences');
+ok($tokens->add('XIP', 'Dependency'), 'Add XIP/Dependency');
$stream = $tokens->stream;
like($stream->pos(1)->to_string, qr!\|>:xip/d:NMOD\$<i>3!, 'Dependency fine');
@@ -311,6 +311,72 @@
like($stream->pos(16)->to_string, qr!\|<xip/d:VMAIN\$<i>16!, 'Dependency fine');
like($stream->pos(17)->to_string, qr!\|<:xip/d:AUXIL\$<i>16!, 'Dependency fine');
+# New instantiation
+ok($tokens = new_tokenizer->parse, 'New Tokenizer');
+
+# Add XIP/Constituency
+ok($tokens->add('XIP', 'Constituency'), 'Add XIP/Constituency');
+
+$stream = $tokens->stream;
+like($stream->pos(0)->to_string, qr!\|<>:xip/c:TOP#0-129\$<i>17!, 'Constituency fine');
+like($stream->pos(0)->to_string, qr!\|<>:xip/c:MC#0-129\$<i>17<b>1!, 'Constituency fine');
+like($stream->pos(0)->to_string, qr!\|<>:xip/c:PP#0-30\$<i>4<b>2!, 'Constituency fine');
+like($stream->pos(0)->to_string, qr!\|<>:xip/c:PREP#0-3\$<i>1!, 'Constituency fine');
+
+like($stream->pos(1)->to_string, qr!\|<>:xip/c:NP#4-30\$<i>4<b>3!, 'Constituency fine');
+like($stream->pos(1)->to_string, qr!\|<>:xip/c:NPA#4-30\$<i>4<b>4!, 'Constituency fine');
+like($stream->pos(1)->to_string, qr!\|<>:xip/c:AP#4-11\$<i>2<b>5!, 'Constituency fine');
+like($stream->pos(1)->to_string, qr!\|<>:xip/c:ADJ#4-11\$<i>2<b>6!, 'Constituency fine');
+
+like($stream->pos(2)->to_string, qr!\|<>:xip/c:AP#12-23\$<i>3<b>5!, 'Constituency fine');
+like($stream->pos(2)->to_string, qr!\|<>:xip/c:ADJ#12-23\$<i>3<b>6!, 'Constituency fine');
+
+like($stream->pos(3)->to_string, qr!\|<>:xip/c:NOUN#24-30\$<i>4<b>5!, 'Constituency fine');
+
+like($stream->pos(4)->to_string, qr!\|<>:xip/c:VERB#31-35\$<i>5<b>2!, 'Constituency fine');
+
+like($stream->pos(5)->to_string, qr!\|<>:xip/c:NP#36-47\$<i>7<b>2!, 'Constituency fine');
+like($stream->pos(5)->to_string, qr!\|<>:xip/c:DET#36-39\$<i>6<b>3!, 'Constituency fine');
+
+like($stream->pos(6)->to_string, qr!\|<>:xip/c:NPA#40-47\$<i>7<b>3!, 'Constituency fine');
+like($stream->pos(6)->to_string, qr!\|<>:xip/c:NOUN#40-47\$<i>7<b>4!, 'Constituency fine');
+
+like($stream->pos(7)->to_string, qr!\|<>:xip/c:NP#48-63\$<i>9<b>2!, 'Constituency fine');
+like($stream->pos(7)->to_string, qr!\|<>:xip/c:DET#48-51\$<i>8<b>3!, 'Constituency fine');
+
+like($stream->pos(8)->to_string, qr!\|<>:xip/c:NPA#52-63\$<i>9<b>3!, 'Constituency fine');
+like($stream->pos(8)->to_string, qr!\|<>:xip/c:NOUN#52-63\$<i>9<b>4!, 'Constituency fine');
+
+like($stream->pos(9)->to_string, qr!\|<>:xip/c:NP#64-73\$<i>10<b>2!, 'Constituency fine');
+like($stream->pos(9)->to_string, qr!\|<>:xip/c:NPA#64-73\$<i>10<b>3!, 'Constituency fine');
+like($stream->pos(9)->to_string, qr!\|<>:xip/c:NOUN#64-73\$<i>10<b>4!, 'Constituency fine');
+
+like($stream->pos(10)->to_string, qr!\|<>:xip/c:PTCL#74-77\$<i>11<b>2!, 'Constituency fine');
+
+like($stream->pos(11)->to_string, qr!\|<>:xip/c:SC#79-128\$<i>18!, 'Constituency fine');
+like($stream->pos(11)->to_string, qr!\|<>:xip/c:CONJ#79-84\$<i>12<b>1!, 'Constituency fine');
+
+like($stream->pos(12)->to_string, qr!\|<>:xip/c:NP#85-96\$<i>14<b>1!, 'Constituency fine');
+like($stream->pos(12)->to_string, qr!\|<>:xip/c:DET#85-88\$<i>13<b>2!, 'Constituency fine');
+
+
+like($stream->pos(13)->to_string, qr!\|<>:xip/c:NPA#89-96\$<i>14<b>2!, 'Constituency fine');
+like($stream->pos(13)->to_string, qr!\|<>:xip/c:NOUN#89-96\$<i>14<b>3!, 'Constituency fine');
+
+like($stream->pos(14)->to_string, qr!\|<>:xip/c:NP#97-101\$<i>15<b>1!, 'Constituency fine');
+like($stream->pos(14)->to_string, qr!\|<>:xip/c:NPA#97-101\$<i>15<b>2!, 'Constituency fine');
+like($stream->pos(14)->to_string, qr!\|<>:xip/c:NOUN#97-101\$<i>15<b>3!, 'Constituency fine');
+
+like($stream->pos(15)->to_string, qr!\|<>:xip/c:NP#102-111\$<i>16<b>1!, 'Constituency fine');
+like($stream->pos(15)->to_string, qr!\|<>:xip/c:NPA#102-111\$<i>16<b>2!, 'Constituency fine');
+like($stream->pos(15)->to_string, qr!\|<>:xip/c:NOUN#102-111\$<i>16<b>3!, 'Constituency fine');
+
+like($stream->pos(16)->to_string, qr!\|<>:xip/c:VERB#112-123\$<i>17<b>1!, 'Constituency fine');
+
+like($stream->pos(17)->to_string, qr!\|<>:xip/c:VERB#124-128\$<i>18<b>1!, 'Constituency fine');
+
+# diag $stream->to_string;
+
# ADJA ADJA NN VVFIN ART NN ART NN NE PTKVZ KOUS ART NN NN NN VVPP VAFIN
done_testing;
diff --git a/t/transform.t b/t/transform.t
index 63bc5a3..7387fe3 100644
--- a/t/transform.t
+++ b/t/transform.t
@@ -110,7 +110,7 @@
ok($tokens->add(@$_), 'Add '. join(', ', @$_));
};
-is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s#0-74$<i>13|<>:base/text#0-6083$<i>923|-:base/sentences$<i>96|<>:base/para#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s#0-74$<i>13|-:xip/sentences$<i>64]', 'Startinfo');
+is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s#0-74$<i>13|<>:base/text#0-6083$<i>923|-:base/sentences$<i>96|<>:base/para#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/c:np#0-1$<i>1|<>:cnx/s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|<>:xip/c:NP#0-1$<i>1<b>2|<>:xip/c:NPA#0-1$<i>1<b>3|<>:xip/c:NOUN#0-1$<i>1<b>4|<>:xip/c:SYMBOL#0-1$<i>1<b>5|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s#0-74$<i>13|-:xip/sentences$<i>64]', 'Startinfo');
is($tokens->stream->pos(118)->to_string,