Fixed payloads, sorted tokens, major speed improvements
diff --git a/lib/KorAP/Field/MultiTerm.pm b/lib/KorAP/Field/MultiTerm.pm
index 8210c56..0f93787 100644
--- a/lib/KorAP/Field/MultiTerm.pm
+++ b/lib/KorAP/Field/MultiTerm.pm
@@ -1,11 +1,122 @@
package KorAP::Field::MultiTerm;
-use Mojo::Base -base;
+use strict;
+use warnings;
use MIME::Base64;
-has [qw/p_start p_end o_start o_end term payload/];
-has store_offsets => 1;
+# Create a new multi term.
+#
+# Accepts a flat list of key/value pairs. Recognized keys are
+# term, p_start, p_end, payload, store_offsets, o_start and o_end.
+# Each value is passed to the accessor of the same name,
+# unknown keys are silently ignored.
+# Returns the new object.
+sub new {
+  my $self = bless [], shift;
+
+  # The former attribute declaration defaulted store_offsets
+  # to 1 - keep that default unless the caller overrides it
+  $self->[6] = 1;
+
+  for (my $i = 0; $i < scalar @_; $i += 2) {
+    my $key = $_[$i];
+
+    # Ignore undefined keys instead of comparing them
+    next unless defined $key;
+
+    if ($key eq 'term') {
+      $self->term($_[$i+1]);
+    }
+    elsif ($key eq 'p_start') {
+      $self->p_start($_[$i+1]);
+    }
+    elsif ($key eq 'p_end') {
+      $self->p_end($_[$i+1]);
+    }
+    elsif ($key eq 'payload') {
+      $self->payload($_[$i+1]);
+    }
+    elsif ($key eq 'store_offsets') {
+      $self->store_offsets($_[$i+1]);
+    }
+    elsif ($key eq 'o_start') {
+      $self->o_start($_[$i+1]);
+    }
+    elsif ($key eq 'o_end') {
+      $self->o_end($_[$i+1]);
+    };
+  };
+  $self;
+};
+# 0: payload
+# Returns the payload. If a defined value is passed,
+# the payload is set to that value first.
+sub payload {
+  return $_[0]->[0] unless defined $_[1];
+  $_[0]->[0] = $_[1];
+};
+
+# 1: p_start
+# Returns the start position. If a defined value is passed,
+# the start position is set to that value first.
+sub p_start {
+  return $_[0]->[1] unless defined $_[1];
+  $_[0]->[1] = $_[1];
+};
+
+# 2: p_end
+# Returns the end position. If a defined value is passed,
+# the end position is set to that value first.
+sub p_end {
+  return $_[0]->[2] unless defined $_[1];
+  $_[0]->[2] = $_[1];
+};
+
+# 3: o_start
+# Returns the start character offset. If a defined value
+# is passed, the offset is set to that value first.
+sub o_start {
+  return $_[0]->[3] unless defined $_[1];
+  $_[0]->[3] = $_[1];
+};
+
+# 4: o_end
+# Returns the end character offset. If a defined value
+# is passed, the offset is set to that value first.
+sub o_end {
+  return $_[0]->[4] unless defined $_[1];
+  $_[0]->[4] = $_[1];
+};
+
+# 5: term
+# Returns the term string. If a defined value is passed,
+# the term is set to that value first.
+sub term {
+  return $_[0]->[5] unless defined $_[1];
+  $_[0]->[5] = $_[1];
+};
+
+# 6: store_offsets
+# Returns the store_offsets flag. If a defined value is passed,
+# the flag is set to that value first.
+sub store_offsets {
+  return $_[0]->[6] unless defined $_[1];
+  $_[0]->[6] = $_[1];
+};
+
+
+# Serialize the term to a string based on the internal array.
+#
+# The term is followed by "#o_start-o_end", if o_start is defined.
+# If p_end or a payload is set, a "$" follows, then "<i>" and
+# p_end (if set) and finally the payload. Payloads that do
+# not start with "<" are prefixed with "<?>".
+sub to_string {
+  my $string = $_[0]->[5];
+
+  # Character offsets
+  if (defined $_[0]->[3]) {
+    $string .= '#' . $_[0]->[3] . '-' . $_[0]->[4];
+  };
+
+  # Neither an end position nor a payload is given
+  return $string unless $_[0]->[2] || $_[0]->[0];
+
+  $string .= '$';
+
+  # End position
+  if ($_[0]->[2]) {
+    $string .= '<i>' . $_[0]->[2];
+  };
+
+  if ($_[0]->[0]) {
+
+    # The payload already starts with a type marker
+    if (index($_[0]->[0], '<') == 0) {
+      $string .= $_[0]->[0];
+    }
+    else {
+      $string .= '<?>' . $_[0]->[0];
+    };
+  };
+
+  $string;
+};
+
+
+sub to_string_2 {
my $self = shift;
my $string = $self->term;
if (defined $self->o_start) {
@@ -31,6 +142,9 @@
return $string;
};
+
+
+
sub to_solr {
my $self = shift;
my $increment = shift;
diff --git a/lib/KorAP/Field/MultiTermToken.pm b/lib/KorAP/Field/MultiTermToken.pm
index 0d8742b..fb83c1a 100644
--- a/lib/KorAP/Field/MultiTermToken.pm
+++ b/lib/KorAP/Field/MultiTermToken.pm
@@ -1,56 +1,196 @@
package KorAP::Field::MultiTermToken;
use KorAP::Field::MultiTerm;
-use Mojo::Base -base;
use List::MoreUtils 'uniq';
+use strict;
+use warnings;
-has [qw/o_start o_end/];
+# This tries to be highly optimized - it's not supposed to be readable
+
+# Construct an empty multi term token
+sub new {
+  my $class = shift;
+  return bless [], $class;
+};
+# Add a term to the token.
+#
+# Accepts either a KorAP::Field::MultiTerm object, a single term
+# string or a list of key/value pairs, that is passed to
+# KorAP::Field::MultiTerm->new.
+# Returns the added KorAP::Field::MultiTerm object.
sub add {
my $self = shift;
my $mt;
-unless (ref $_[0] eq 'MultiTerm') {
+
+# ref() returns the full package name of an object
+unless (ref $_[0] eq 'KorAP::Field::MultiTerm') {
if (@_ == 1) {
- $mt = KorAP::Field::MultiTerm->new(term => shift());
+ $mt = KorAP::Field::MultiTerm->new(term => $_[0]);
}
else {
$mt = KorAP::Field::MultiTerm->new(@_);
};
}
else {
- $mt = shift;
+ $mt = $_[0];
};
- $self->{mt} //= [];
- push(@{$self->{mt}}, $mt);
- return $mt;
+
+ # Slot 0 holds the list of terms
+ $self->[0] //= [];
+ push(@{$self->[0]}, $mt);
+ $mt;
};
-# Return a new term id
+# 0 -> mt
+
+# 1: o_start
+# Returns the start character offset of the token. If a defined
+# value is passed, the offset is set to that value first.
+sub o_start {
+  return $_[0]->[1] unless defined $_[1];
+  $_[0]->[1] = $_[1];
+};
+
+# 2: o_end
+# Returns the end character offset of the token. If a defined
+# value is passed, the offset is set to that value first.
+sub o_end {
+  return $_[0]->[2] unless defined $_[1];
+  $_[0]->[2] = $_[1];
+};
+
+# 3: Return a new term id
+# The first id is 1, every call increments the counter.
sub id_counter {
- $_[0]->{id_counter} //= 1;
- return $_[0]->{id_counter}++;
+ $_[0]->[3] = 1 unless defined $_[0]->[3];
+ return $_[0]->[3]++;
};
-
sub surface {
- substr($_[0]->{mt}->[0]->term,2);
+ # The first term holds the surface form behind a two character prefix
+ my $first = $_[0]->[0]->[0];
+ return substr($first->term, 2);
};
sub lc_surface {
- substr($_[0]->{mt}->[1]->term,2);
+ # The second term holds the lowercased surface form behind a two character prefix
+ my $second = $_[0]->[0]->[1];
+ return substr($second->term, 2);
};
+# Serialize all terms of the token.
+#
+# Returns an array reference of the unique term strings,
+# ordered by _sort. A token without terms yields an empty array.
+sub to_array {
+  my $self = shift;
+
+  # No term was added yet - avoid dereferencing
+  # an undefined term list
+  my $terms = $self->[0] // [];
+
+  [uniq(map($_->to_string, sort _sort @$terms))];
+};
+
+
sub to_string {
my $self = shift;
my $string = '[(' . $self->o_start . '-'. $self->o_end . ')';
- $string .= join ('|', map($_->to_string, @{$self->{mt}}));
+
+ # The serialized terms are taken from to_array
+ my $terms = $self->to_array;
+ $string .= join('|', @$terms);
$string .= ']';
return $string;
};
+# Get relation based positions
+#
+# Expects the payload of a relation term and returns the start
+# and the end position of the right part of the relation as a list.
+# If the payload format is unknown, a warning is issued
+# and (0,0) is returned.
+sub _rel_right_pos {
+
+  # token to token - right token
+  return ($1, $1) if $_[0] =~ m/^<i>(\d+)<s>/o;
+
+  # token/span to span - right token
+  return ($1, $2) if $_[0] =~ m/^<i>(\d+)<i>(\d+)<s>/o;
+
+  # span to token - right token
+  return ($1, $1) if $_[0] =~ m/^<b>\d+<i>(\d+)<s>/o;
+
+  warn 'Unknown relation format!';
+  return (0,0);
+};
-sub to_array {
- my $self = shift;
- [uniq(map($_->to_string, @{$self->{mt}}))];
+# Sort spans, attributes and relations
+#
+# Comparison routine for sort(). It compares the package variables
+# $a and $b by reading slot 0 (payload), slot 2 (p_end) and
+# slot 5 (term) of both.
+#
+# - Two spans ("<>:") are ordered by p_end and then by the depth
+#   given in the payload ("<b>depth")
+# - Two attributes ("@:") are ordered by the reference id
+#   given in the payload ("<s>id")
+# - Two relations ("<:" or ">:") are ordered by p_end and then by
+#   the start and end position of the right part of the relation
+# - Everything else is ordered alphabetically by term
+#
+# Ties are resolved alphabetically by term, so the result does
+# not depend on the order of the arguments. Missing positions,
+# depths and ids are treated as 0.
+sub _sort {
+  my $a_is_span = index($a->[5], '<>:') == 0;
+  my $b_is_span = index($b->[5], '<>:') == 0;
+
+  # Sort both spans
+  if ($a_is_span && $b_is_span) {
+    my $cmp = ($a->[2] // 0) <=> ($b->[2] // 0);
+    return $cmp if $cmp;
+
+    # Check depth
+    my ($a_depth) = (($a->[0] // '') =~ m/^<b>(\d+)/);
+    my ($b_depth) = (($b->[0] // '') =~ m/^<b>(\d+)/);
+
+    $cmp = ($a_depth // 0) <=> ($b_depth // 0);
+    return $cmp if $cmp;
+  }
+
+  # Both are no spans
+  elsif (!$a_is_span && !$b_is_span) {
+
+    # Both are attributes
+    # Order attributes by reference id
+    if (index($a->[5], '@:') == 0 && index($b->[5], '@:') == 0) {
+      my ($a_id) = (($a->[0] // '') =~ m/^<s>(\d+)/);
+      my ($b_id) = (($b->[0] // '') =~ m/^<s>(\d+)/);
+
+      my $cmp = ($a_id // 0) <=> ($b_id // 0);
+      return $cmp if $cmp;
+    }
+
+    # Both are relations
+    elsif (
+      (index($a->[5], '<:') == 0 || index($a->[5], '>:') == 0) &&
+      (index($b->[5], '<:') == 0 || index($b->[5], '>:') == 0)) {
+
+      # left is p_end
+      my $cmp = ($a->[2] // 0) <=> ($b->[2] // 0);
+      return $cmp if $cmp;
+
+      # Check for right positions
+      my ($a_start, $a_end) = _rel_right_pos($a->[0]);
+      my ($b_start, $b_end) = _rel_right_pos($b->[0]);
+
+      $cmp = ($a_start <=> $b_start) || ($a_end <=> $b_end);
+      return $cmp if $cmp;
+    };
+  };
+
+  # This has to be sorted alphabetically!
+  # Mixed kinds and all ties end up here.
+  return $a->[5] cmp $b->[5];
};
diff --git a/lib/KorAP/Field/MultiTermTokenStream.pm b/lib/KorAP/Field/MultiTermTokenStream.pm
index f9e97a2..47524ab 100644
--- a/lib/KorAP/Field/MultiTermTokenStream.pm
+++ b/lib/KorAP/Field/MultiTermTokenStream.pm
@@ -34,6 +34,7 @@
$_[0]->{mtt};
};
+
sub to_array {
my $self = shift;
[ map { $_->to_array } @{$self->{mtt}} ];
diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
index 105bd59..e5386f4 100644
--- a/lib/KorAP/Index/Base/Paragraphs.pm
+++ b/lib/KorAP/Index/Base/Paragraphs.pm
@@ -14,7 +14,8 @@
term => '<>:base/s:p',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>1'
);
$i++;
}
diff --git a/lib/KorAP/Index/Base/Sentences.pm b/lib/KorAP/Index/Base/Sentences.pm
index f1ca6f9..6d485ad 100644
--- a/lib/KorAP/Index/Base/Sentences.pm
+++ b/lib/KorAP/Index/Base/Sentences.pm
@@ -18,7 +18,8 @@
term => '<>:base/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>2'
);
$last_p = $span->p_end;
$last_o = $span->o_end;
@@ -31,7 +32,8 @@
term => '<>:base/s:t',
o_start => $first->[1],
p_end => $last_p,
- o_end => $last_o
+ o_end => $last_o,
+ payload => '<b>0'
);
$$self->stream->add_meta('base/sentences', '<i>' . $i);
diff --git a/lib/KorAP/Index/Connexor/Phrase.pm b/lib/KorAP/Index/Connexor/Phrase.pm
index ed36de3..00a1b0d 100644
--- a/lib/KorAP/Index/Connexor/Phrase.pm
+++ b/lib/KorAP/Index/Connexor/Phrase.pm
@@ -22,7 +22,8 @@
term => '<>:cnx/c:' . $type,
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>0' # Pseudo-depth
);
};
}
diff --git a/lib/KorAP/Index/Connexor/Sentences.pm b/lib/KorAP/Index/Connexor/Sentences.pm
index ac6f89f..db95729 100644
--- a/lib/KorAP/Index/Connexor/Sentences.pm
+++ b/lib/KorAP/Index/Connexor/Sentences.pm
@@ -15,7 +15,8 @@
term => '<>:cnx/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>2'
);
$i++;
}
diff --git a/lib/KorAP/Index/CoreNLP/Constituency.pm b/lib/KorAP/Index/CoreNLP/Constituency.pm
index 4793bfd..ee37abc 100644
--- a/lib/KorAP/Index/CoreNLP/Constituency.pm
+++ b/lib/KorAP/Index/CoreNLP/Constituency.pm
@@ -1,7 +1,6 @@
package KorAP::Index::CoreNLP::Constituency;
use KorAP::Index::Base;
use Set::Scalar;
-use v5.16;
sub parse {
my $self = shift;
@@ -25,8 +24,14 @@
$rel = [$rel] unless ref $rel eq 'ARRAY';
foreach (@$rel) {
- if ($_->{-label} eq 'dominates' && $_->{-target}) {
- $corenlp_const_noroot->insert($_->{-target});
+ if ($_->{-label} eq 'dominates') {
+ if ($_->{-target}) {
+ $corenlp_const_noroot->insert($_->{-target});
+ }
+ elsif (my $uri = $_->{-uri}) {
+ $uri =~ s/^morpho\.xml#//;
+ $corenlp_const_noroot->insert($uri);
+ };
};
};
}
@@ -34,7 +39,8 @@
my $stream = $$self->stream;
- my $add_const = sub {
+ my $add_const;
+ $add_const = sub {
my $span = shift;
my $level = shift;
my $mtt = $stream->pos($span->p_start);
@@ -53,11 +59,11 @@
p_end => $span->p_end
);
- $term{payload} = '<b>' . $level if $level;
+ $term{payload} = '<b>' . ($level // 0);
$mtt->add(%term);
- my $this = __SUB__;
+ my $this = $add_const;
my $rel = $content->{rel} or return;
$rel = [$rel] unless ref $rel eq 'ARRAY';
@@ -79,7 +85,7 @@
};
sub layer_info {
- ['corenlp/c=const']
+ # The constituency layer is announced as spans
+ return ['corenlp/c=spans'];
}
1;
diff --git a/lib/KorAP/Index/CoreNLP/Sentences.pm b/lib/KorAP/Index/CoreNLP/Sentences.pm
index 0f40213..cacc2b0 100644
--- a/lib/KorAP/Index/CoreNLP/Sentences.pm
+++ b/lib/KorAP/Index/CoreNLP/Sentences.pm
@@ -15,7 +15,8 @@
term => '<>:corenlp/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>2'
);
$i++;
}
diff --git a/lib/KorAP/Index/Mate/Morpho.pm b/lib/KorAP/Index/Mate/Morpho.pm
index 035bc37..1a06f63 100644
--- a/lib/KorAP/Index/Mate/Morpho.pm
+++ b/lib/KorAP/Index/Mate/Morpho.pm
@@ -13,69 +13,6 @@
my $content = $token->hash->{fs}->{f};
- my ($found, $pos, $msd, $id);
-
- my $capital = 0;
-
- foreach my $f (@{$content->{fs}->{f}}) {
- #pos
- if (($f->{-name} eq 'pos') && ($found = $f->{'#text'})) {
- $pos = $found;
- }
-
- # lemma
- elsif (($f->{-name} eq 'lemma')
- && ($found = $f->{'#text'})
- && $found ne '--') {
- $mtt->add(term => 'mate/l:' . $found);
- }
-
- # MSD
- elsif (($f->{-name} eq 'msd') &&
- ($found = $f->{'#text'}) &&
- ($found ne '_')) {
- $msd = $found;
- $id = $mtt->id_counter;
- };
- };
-
- $mtt->add(term => 'mate/m:' . $pos . ($id ? ('$<s>' . $id) : ''));
-
- # MSD
- if ($msd) {
- foreach (split '\|', $msd) {
- my ($x, $y) = split "=", $_;
- # case, tense, number, mood, person, degree, gender
- $mtt->add(term => '@:' . $x . ($y ? '=' . $y : '') . '$<s>' . $id);
- };
- };
- }) or return;
-
- return 1;
-};
-
-sub layer_info {
- ['mate/l=tokens', 'mate/m=tokens']
-};
-
-1;
-
-
-__END__
-
-
-sub parse {
- my $self = shift;
-
- $$self->add_tokendata(
- foundry => 'mate',
- layer => 'morpho',
- cb => sub {
- my ($stream, $token) = @_;
- my $mtt = $stream->pos($token->pos);
-
- my $content = $token->hash->{fs}->{f};
-
my $found;
my $capital = 0;
diff --git a/lib/KorAP/Index/Mate/Morpho2.pm b/lib/KorAP/Index/Mate/Morpho2.pm
new file mode 100644
index 0000000..e032f2f
--- /dev/null
+++ b/lib/KorAP/Index/Mate/Morpho2.pm
@@ -0,0 +1,63 @@
+# The package name has to match the file name (Mate/Morpho2.pm),
+# otherwise loading this file redefines the subroutines
+# of KorAP::Index::Mate::Morpho
+package KorAP::Index::Mate::Morpho2;
+use KorAP::Index::Base;
+
+# This attaches morphological information as attributes to the pos
+
+# Parse the morpho layer of the mate foundry.
+#
+# For every token the lemma is added as "mate/l:", the part of
+# speech as "mate/m:" and every morphological feature as an
+# attribute ("@:"). The part of speech and its attributes share
+# an id, that is passed as an "<s>" payload.
+# Returns 1 on success and nothing, if add_tokendata fails.
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'mate',
+    layer => 'morpho',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $content = $token->hash->{fs}->{f};
+
+      my ($found, $pos, $msd, $id);
+
+      foreach my $f (@{$content->{fs}->{f}}) {
+        #pos
+        if (($f->{-name} eq 'pos') && ($found = $f->{'#text'})) {
+          $pos = $found;
+        }
+
+        # lemma
+        elsif (($f->{-name} eq 'lemma')
+                 && ($found = $f->{'#text'})
+                 && $found ne '--') {
+          $mtt->add(term => 'mate/l:' . $found);
+        }
+
+        # MSD
+        elsif (($f->{-name} eq 'msd') &&
+                 ($found = $f->{'#text'}) &&
+                 ($found ne '_')) {
+          $msd = $found;
+          $id = $mtt->id_counter;
+        };
+      };
+
+      # Only add the part of speech if the layer provides one
+      if (defined $pos) {
+        my %term = (term => 'mate/m:' . $pos);
+
+        # The id is passed as a payload instead of being
+        # part of the term, so it is available for sorting
+        $term{payload} = '<s>' . $id if $id;
+        $mtt->add(%term);
+      };
+
+      # MSD
+      if ($msd) {
+        foreach (split '\|', $msd) {
+          my ($x, $y) = split "=", $_;
+          # case, tense, number, mood, person, degree, gender
+          $mtt->add(
+            term    => '@:' . $x . ($y ? '=' . $y : ''),
+            payload => '<s>' . $id
+          );
+        };
+      };
+    }) or return;
+
+  return 1;
+};
+
+# Layer info: lemma and morphology are token based
+sub layer_info {
+  return ['mate/l=tokens', 'mate/m=tokens'];
+};
+
+1;
diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
index 8710763..f4e84e9 100644
--- a/lib/KorAP/Index/OpenNLP/Sentences.pm
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm
@@ -15,7 +15,8 @@
term => '<>:opennlp/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>2' # t/p/s
);
$i++;
}
diff --git a/lib/KorAP/Index/TreeTagger/Morpho.pm b/lib/KorAP/Index/TreeTagger/Morpho.pm
index 81766c7..989728b 100644
--- a/lib/KorAP/Index/TreeTagger/Morpho.pm
+++ b/lib/KorAP/Index/TreeTagger/Morpho.pm
@@ -22,11 +22,11 @@
$content = $fs->{fs}->{f};
my @val;
- my $certainty = '';
+ my $certainty = 0;
foreach (@$content) {
if ($_->{-name} eq 'certainty') {
$certainty = floor(($_->{'#text'} * 255));
- $certainty = '$<b>' . $certainty if $certainty;
+ $certainty = $certainty if $certainty;
}
else {
push @val, $_
@@ -39,16 +39,20 @@
($found = $_->{'#text'}) &&
($found ne 'UNKNOWN') &&
($found ne '?')) {
- $mtt->add(
- term => 'tt/l:' . $found . $certainty
+ my %term = (
+ term => 'tt/l:' . $found
);
+ $term{payload} = '<b>' . $certainty if $certainty;
+ $mtt->add(%term);
};
# pos
if (($_->{-name} eq 'ctag') && ($found = $_->{'#text'})) {
- $mtt->add(
- term => 'tt/p:' . $found . $certainty
+ my %term = (
+ term => 'tt/p:' . $found
);
+ $term{payload} = '<b>' . $certainty if $certainty;
+ $mtt->add(%term);
};
};
};
diff --git a/lib/KorAP/Index/TreeTagger/Sentences.pm b/lib/KorAP/Index/TreeTagger/Sentences.pm
index 06669ea..37b49e0 100644
--- a/lib/KorAP/Index/TreeTagger/Sentences.pm
+++ b/lib/KorAP/Index/TreeTagger/Sentences.pm
@@ -15,7 +15,8 @@
term => '<>:tt/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>2' # Depth is 2 by default t/p/s
);
$i++;
}
diff --git a/lib/KorAP/Index/XIP/Constituency.pm b/lib/KorAP/Index/XIP/Constituency.pm
index 62a4a29..d181afd 100644
--- a/lib/KorAP/Index/XIP/Constituency.pm
+++ b/lib/KorAP/Index/XIP/Constituency.pm
@@ -34,6 +34,7 @@
my $rel = $span->hash->{rel} or return;
$rel = [$rel] unless ref $rel eq 'ARRAY';
+ # Iterate over all relations
foreach (@$rel) {
if ($_->{-label} eq 'dominates') {
@@ -43,6 +44,7 @@
$target = $1;
};
+ # The target may not be addressable
next unless $target;
# It's definately not a root
@@ -57,7 +59,7 @@
# Recursive tree traversal method
my $add_const;
- $add_const= sub {
+ $add_const = sub {
my ($span, $level) = @_;
weaken $xip_const_root;
@@ -68,6 +70,7 @@
my $content = $span->hash;
my $f = $content->{fs}->{f};
+
unless ($f->{-name} eq 'const') {
warn $f->{-id} . ' is no constant';
return;
@@ -89,7 +92,7 @@
);
# Only add level payload if node != root
- $term{payload} = '<b>' . $level if $level;
+ $term{payload} ='<b>' . ($level // 0);
$mtt->add(%term);
@@ -141,7 +144,7 @@
# Layer info
sub layer_info {
- ['xip/c=const']
-}
+ # The constituency layer is announced as spans
+ return ['xip/c=spans'];
+};
1;
diff --git a/lib/KorAP/Index/XIP/Morpho.pm b/lib/KorAP/Index/XIP/Morpho.pm
index 1eae6c6..474bef0 100644
--- a/lib/KorAP/Index/XIP/Morpho.pm
+++ b/lib/KorAP/Index/XIP/Morpho.pm
@@ -60,7 +60,7 @@
};
sub layer_info {
- ['xip/l=lemma', 'xip/p=pos']
+ # Lemma and part of speech are token based
+ return ['xip/l=tokens', 'xip/p=tokens'];
};
diff --git a/lib/KorAP/Index/XIP/Sentences.pm b/lib/KorAP/Index/XIP/Sentences.pm
index f045152..35ab6f1 100644
--- a/lib/KorAP/Index/XIP/Sentences.pm
+++ b/lib/KorAP/Index/XIP/Sentences.pm
@@ -18,7 +18,8 @@
term => '<>:xip/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>2'
);
$i++;
}
@@ -29,4 +30,9 @@
return 1;
};
+# Layer info: sentences are announced as spans
+sub layer_info {
+  return ['xip/s=spans'];
+};
+
+
1;
diff --git a/t/artificial.t b/t/artificial.t
index 5b5210f..10d4429 100644
--- a/t/artificial.t
+++ b/t/artificial.t
@@ -59,8 +59,9 @@
is($tokens->name, 'tokens', 'Name');
is($tokens->layer, 'Tokens', 'Layer');
-is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18]', 'Token is correct');
-is($tokens->stream->pos(1)->to_string, '[(4-11)s:letzten|i:letzten|_1#4-11]', 'Token is correct');
+is($tokens->stream->pos(0)->to_string, '[(0-3)-:tokens$<i>18|_0#0-3|i:zum|s:Zum]', 'Token is correct');
+
+is($tokens->stream->pos(1)->to_string, '[(4-11)_1#4-11|i:letzten|s:letzten]', 'Token is correct');
my $i = 2;
foreach ([12,23, 'kulturellen'],
@@ -82,8 +83,8 @@
) {
is($tokens->stream->pos($i++)->to_string,
'[('.$_->[0].'-'.$_->[1].')'.
- 's:'.$_->[2].'|i:'.lc($_->[2]).'|'.
- '_'.($i-1).'#'.$_->[0].'-'.$_->[1].']',
+ '_'.($i-1).'#'.$_->[0].'-'.$_->[1] . '|' .
+ 'i:'.lc($_->[2]).'|s:'.$_->[2].']',
'Token is correct');
};
@@ -92,6 +93,7 @@
# Add OpenNLP/morpho
ok($tokens->add('OpenNLP', 'Morpho'), 'Add OpenNLP/Morpho');
+
$i = 0;
foreach (qw/APPRART ADJA ADJA NN VVFIN ART NN ART NN NE PTKVZ KOUS ART NN NN NN VVPP VAFIN/) {
like($tokens->stream->pos($i++)->to_string,
@@ -103,8 +105,7 @@
# Add OpenNLP/sentences
ok($tokens->add('OpenNLP', 'Sentences'), 'Add OpenNLP/Sentences');
-is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|opennlp/p:APPRART|<>:opennlp/s:s#0-129$<i>17|-:opennlp/sentences$<i>1]', 'Correct sentence');
-
+is($tokens->stream->pos(0)->to_string, '[(0-3)-:opennlp/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|opennlp/p:APPRART|<>:opennlp/s:s#0-129$<i>17]', 'Correct sentence');
# New instantiation
ok($tokens = KorAP::Tokenizer->new(
@@ -124,7 +125,7 @@
ok($tokens->add('Base', 'Paragraphs'), 'Add Base/Paragraphs');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:base/s:s#0-129$<i>17|<>:base/s:t#0-129$<i>17|-:base/sentences$<i>1|-:base/paragraphs$<i>0]',
+ '[(0-3)-:base/paragraphs$<i>0|-:base/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|<>:base/s:t#0-129$<i>17<b>0|<>:base/s:s#0-129$<i>17<b>0]',
'Correct base annotation');
@@ -135,11 +136,11 @@
ok($tokens->add('CoreNLP', 'NamedEntities', 'ne_dewac_175m_600'), 'Add CoreNLP/NamedEntities');
ok($tokens->add('CoreNLP', 'NamedEntities', 'ne_hgc_175m_600'), 'Add CoreNLP/NamedEntities');
+# [(64-73)s:Hofbergli|i:hofbergli|_9#64-73|corenlp/ne_dewac_175m_600:I-LOC|corenlp/ne_hgc_175m_600:I-LOC]
is($tokens->stream->pos(9)->to_string,
- '[(64-73)s:Hofbergli|i:hofbergli|_9#64-73|corenlp/ne_dewac_175m_600:I-LOC|corenlp/ne_hgc_175m_600:I-LOC]',
+ '[(64-73)_9#64-73|corenlp/ne:I-LOC|i:hofbergli|s:Hofbergli]',
'Correct NamedEntities annotation');
-
# New instantiation
ok($tokens = new_tokenizer->parse, 'Parse');
@@ -147,7 +148,7 @@
ok($tokens->add('CoreNLP', 'Morpho'), 'Add CoreNLP/Morpho');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|corenlp/p:APPRART]',
+ '[(0-3)-:tokens$<i>18|_0#0-3|i:zum|s:Zum|corenlp/p:APPRART]',
'Correct corenlp annotation');
$i = 0;
@@ -161,10 +162,9 @@
ok($tokens->add('CoreNLP', 'Sentences'), 'Add CoreNLP/Sentences');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|corenlp/p:APPRART|<>:corenlp/s:s#0-129$<i>17|-:corenlp/sentences$<i>1]',
+ '[(0-3)-:corenlp/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|corenlp/p:APPRART|<>:corenlp/s:s#0-129$<i>17]',
'Correct corenlp annotation');
-
# New instantiation
ok($tokens = new_tokenizer->parse, 'New Tokenizer');
@@ -172,7 +172,7 @@
ok($tokens->add('Connexor', 'Sentences'), 'Add Connexor/Sentences');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:cnx/s:s#0-129$<i>17|-:cnx/sentences$<i>1]',
+ '[(0-3)-:cnx/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|<>:cnx/s:s#0-129$<i>17<b>0]',
'Correct cnx annotation');
# New instantiation
@@ -192,6 +192,7 @@
'Annotation (Connexor/p) is correct: ' . $_);
};
+
$i = 0;
foreach (qw/! ! ! ! IND:PRES ! ! ! ! Prop ! ! ! ! ! ! PCP:PERF IND:PRES/) {
if ($_ eq '!') {
@@ -212,10 +213,10 @@
# Add Connexor/Phrase
ok($tokens->add('Connexor', 'Phrase'), 'Add Connexor/Phrase');
my $stream = $tokens->stream;
-like($stream->pos(1)->to_string, qr!\|<>:cnx/c:np#4-30\$<i>4!, 'Annotation (Connexor/c) is correct');
-like($stream->pos(6)->to_string, qr!\|<>:cnx/c:np#40-47\$<i>7!, 'Annotation (Connexor/c) is correct');
-like($stream->pos(8)->to_string, qr!\|<>:cnx/c:np#52-73\$<i>10!, 'Annotation (Connexor/c) is correct');
-like($stream->pos(13)->to_string, qr!\|<>:cnx/c:np#89-111\$<i>16!, 'Annotation (Connexor/c) is correct');
+like($stream->pos(1)->to_string, qr!\|<>:cnx/c:np#4-30\$<i>4<b>0!, 'Annotation (Connexor/c) is correct');
+like($stream->pos(6)->to_string, qr!\|<>:cnx/c:np#40-47\$<i>7<b>0!, 'Annotation (Connexor/c) is correct');
+like($stream->pos(8)->to_string, qr!\|<>:cnx/c:np#52-73\$<i>10<b>0!, 'Annotation (Connexor/c) is correct');
+like($stream->pos(13)->to_string, qr!\|<>:cnx/c:np#89-111\$<i>16<b>0!, 'Annotation (Connexor/c) is correct');
# New instantiation
ok($tokens = new_tokenizer->parse, 'New Tokenizer');
@@ -242,7 +243,7 @@
# Add XIP/Sentences
ok($tokens->add('XIP', 'Sentences'), 'Add XIP/Sentences');
-is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:xip/s:s#0-129$<i>17|-:xip/sentences$<i>1]', 'First sentence');
+is($tokens->stream->pos(0)->to_string, '[(0-3)-:tokens$<i>18|_0#0-3|i:zum|s:Zum|-:xip/sentences$<i>1|<>:xip/s:s#0-129$<i>17<b>0]', 'First sentence');
# Add XIP/Morpho
ok($tokens->add('XIP', 'Morpho'), 'Add XIP/Morpho');
diff --git a/t/real_goethe.t b/t/real_goethe.t
index e2d6984..15fa677 100644
--- a/t/real_goethe.t
+++ b/t/real_goethe.t
@@ -7,6 +7,10 @@
use Data::Dumper;
use JSON::XS;
+use Benchmark qw/:hireswallclock/;
+
+my $t = Benchmark->new;
+
use utf8;
use lib 'lib', '../lib';
@@ -67,7 +71,7 @@
is($output->{foundries}, '', 'Foundries');
is($output->{layerInfos}, '', 'layerInfos');
-is($output->{data}->[0]->[0], 's:Autobiographische', 'data');
+is($output->{data}->[0]->[3], 's:Autobiographische', 'data');
is($output->{textSigle}, 'GOE_AGA.03828', 'Correct text sigle');
is($output->{docSigle}, 'GOE_AGA', 'Correct document sigle');
@@ -101,8 +105,8 @@
my $first_token = join('||', @{$output->{data}->[0]});
like($first_token, qr/s:Autobiographische/, 'data');
like($first_token, qr/_0#0-17/, 'data');
-like($first_token, qr!<>:base/s:s#0-30\$<i>2!, 'data');
-like($first_token, qr!<>:base\/s:t#0-35199\$<i>5226!, 'data');
+like($first_token, qr!<>:base/s:s#0-30\$<i>2<b>2!, 'data');
+like($first_token, qr!<>:base\/s:t#0-35199\$<i>5226<b>0!, 'data');
## OpenNLP
$tokens->add('OpenNLP', 'Sentences');
@@ -132,8 +136,7 @@
'Foundries');
is($output->{layerInfos}, 'base/s=spans opennlp/p=tokens opennlp/s=spans tt/s=spans', 'layerInfos');
$first_token = join('||', @{$output->{data}->[0]});
-like($first_token, qr!<>:tt/s:s#0-179\$<i>21!, 'data');
-
+like($first_token, qr!<>:tt/s:s#0-179\$<i>21<b>2!, 'data');
$tokens->add('TreeTagger', 'Morpho');
$output = decode_json( $tokens->to_json );
@@ -147,7 +150,6 @@
like($first_token, qr!tt/l:Autobiographische\$<b>89!, 'data');
like($first_token, qr!tt/p:NN\$<b>89!, 'data');
-
## CoreNLP
$tokens->add('CoreNLP', 'NamedEntities');
$output = decode_json( $tokens->to_json );
@@ -171,7 +173,6 @@
$first_token = join('||', @{$output->{data}->[0]});
like($first_token, qr!<>:corenlp/s:s#0-254\$<i>32!, 'data');
-
$tokens->add('CoreNLP', 'Morpho');
$output = decode_json( $tokens->to_json );
like($output->{foundries}, qr!corenlp/morpho!, 'Foundries');
@@ -179,7 +180,33 @@
$first_token = join('||', @{$output->{data}->[0]});
like($first_token, qr!corenlp/p:ADJA!, 'data');
-fail('Check for Constiuency!');
+$tokens->add('CoreNLP', 'Constituency');
+$output = decode_json( $tokens->to_json );
+like($output->{foundries}, qr!corenlp/constituency!, 'Foundries');
+like($output->{layerInfos}, qr!corenlp/c=spans!, 'layerInfos');
+$first_token = join('||', @{$output->{data}->[0]});
+
+# '<>:corenlp/c:ADJA#0-17$<i>1<b>0',
+# '<>:corenlp/c:NP#0-17$<i>1<b>0',
+# '<>:corenlp/c:CNP#0-17$<i>1<b>1',
+# '<>:corenlp/c:NP#0-17$<i>1<b>2',
+# '<>:corenlp/c:AP#0-17$<i>1<b>3',
+# '<>:corenlp/c:PP#0-58$<i>5<b>2',
+# '<>:corenlp/c:S#0-58$<i>5<b>3',
+# '<>:corenlp/c:ROOT#0-254$<i>32<b>0',
+# '<>:corenlp/c:S#0-254$<i>32<b>1',
+
+#like($first_token, qr!<>:corenlp/c:ADJA#0-17$<i>1<b>0!, 'data');
+#like($first_token, qr!<>:corenlp/c:NP#0-17$<i>1<b>0!, 'data');
+
+
+
+diag Dumper $output->{data}->[0];
+
+
+done_testing;
+__END__
+
## Glemm
$tokens->add('Glemm', 'Morpho');
@@ -193,15 +220,13 @@
like($first_token, qr!glemm/l:\+\+Biograph!, 'data');
like($first_token, qr!glemm/l:\+\+-isch!, 'data');
-
## Connexor
$tokens->add('Connexor', 'Sentences');
$output = decode_json( $tokens->to_json );
like($output->{foundries}, qr!connexor/sentences!, 'Foundries');
like($output->{layerInfos}, qr!cnx/s=spans!, 'layerInfos');
$first_token = join('||', @{$output->{data}->[0]});
-like($first_token, qr!<>:cnx/s:s#0-179\$<i>21!, 'data');
-
+like($first_token, qr!<>:cnx/s:s#0-179\$<i>21<b>2!, 'data');
$tokens->add('Connexor', 'Morpho');
$output = decode_json( $tokens->to_json );
@@ -220,7 +245,6 @@
$first_token = join('||', @{$output->{data}->[0]});
like($first_token, qr!<>:cnx/c:np#0-30\$<i>2!, 'data');
-
$tokens->add('Connexor', 'Syntax');
$output = decode_json( $tokens->to_json );
like($output->{foundries}, qr!connexor/syntax!, 'Foundries');
@@ -236,13 +260,55 @@
like($output->{layerInfos}, qr!mate/l=tokens!, 'layerInfos');
like($output->{layerInfos}, qr!mate/m=tokens!, 'layerInfos');
$first_token = join('||', @{$output->{data}->[0]});
-like($first_token, qr!---!, 'data');
+like($first_token, qr!mate/l:autobiographisch!, 'data');
+like($first_token, qr!mate/p:NN!, 'data');
+like($first_token, qr!mate/m:case:nom!, 'data');
+like($first_token, qr!mate/m:number:pl!, 'data');
+like($first_token, qr!mate/m:gender:\*!, 'data');
+
+
+fail("No test for mate dependency");
+
+## XIP
+$tokens->add('XIP', 'Sentences');
+$output = decode_json( $tokens->to_json );
+like($output->{foundries}, qr!xip/sentences!, 'Foundries');
+like($output->{layerInfos}, qr!xip/s=spans!, 'layerInfos');
+$first_token = join('||', @{$output->{data}->[0]});
+like($first_token, qr!<>:xip/s:s#0-179\$<i>21!, 'data');
+
+$tokens->add('XIP', 'Morpho');
+$output = decode_json( $tokens->to_json );
+like($output->{foundries}, qr!xip/morpho!, 'Foundries');
+like($output->{layerInfos}, qr!xip/l=tokens!, 'layerInfos');
+like($output->{layerInfos}, qr!xip/p=tokens!, 'layerInfos');
+$first_token = join('||', @{$output->{data}->[0]});
+like($first_token, qr!<>:xip/s:s#0-179\$<i>21!, 'data');
+
+
+# print timestr(timediff(Benchmark->new, $t));
+# 57.6802 wallclock secs (57.15 usr + 0.12 sys = 57.27 CPU)# $VAR1 = [
+# 55.026 wallclock secs (54.44 usr + 0.10 sys = 54.54 CPU)# $VAR1 = [
+# 55.3887 wallclock secs (54.62 usr + 0.17 sys = 54.79 CPU)# $VAR1 = [
+# 54.9578 wallclock secs (54.51 usr + 0.13 sys = 54.64 CPU)# $VAR1 = [
+# 53.7051 wallclock secs (53.42 usr + 0.11 sys = 53.53 CPU)# $VAR1 = [
+# 47.6566 wallclock secs (46.88 usr + 0.15 sys = 47.03 CPU)# $VAR1 = [
+# 47.2379 wallclock secs (46.60 usr + 0.11 sys = 46.71 CPU)# $VAR1 = [
+# 29.563 wallclock secs (29.37 usr + 0.10 sys = 29.47 CPU)# $VAR1 = [
+# 30.9321 wallclock secs (30.69 usr + 0.14 sys = 30.83 CPU)# $VAR1 = [
+
+$tokens->add('XIP', 'Constituency');
+$output = decode_json( $tokens->to_json );
+like($output->{foundries}, qr!xip/constituency!, 'Foundries');
+like($output->{layerInfos}, qr!xip/c=spans!, 'layerInfos');
+$first_token = join('||', @{$output->{data}->[0]});
+like($first_token, qr!<>:xip/c:NP#0-17\$<i>1<b>1!, 'data');
+like($first_token, qr!<>:xip/c:AP#0-17\$<i>1<b>2!, 'data');
+like($first_token, qr!<>:xip/c:ADJ#0-17\$<i>1<b>3!, 'data');
+like($first_token, qr!<>:xip/c:TOP#0-179\$<i>21<b>0!, 'data');
diag Dumper $output->{data}->[0];
-diag "Use token-ids in tokens!";
-diag "Sort tokens based on positions!";
-
done_testing;
__END__
diff --git a/t/real_goethe_xip_constituency.t b/t/real_goethe_xip_constituency.t
new file mode 100644
index 0000000..2281d1a
--- /dev/null
+++ b/t/real_goethe_xip_constituency.t
@@ -0,0 +1,48 @@
+#!/usr/bin/env perl
+# source ~/perl5/perlbrew/etc/bashrc
+# perlbrew switch perl-blead@korap
+use strict;
+use warnings;
+use Test::More;
+use Data::Dumper;
+use JSON::XS;
+
+use utf8;
+use lib 'lib', '../lib';
+
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+
+use_ok('KorAP::Document');
+
+# GOE/AGA/03828
+my $path = catdir(dirname(__FILE__), 'GOE/AGA/03828');
+
+ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
+ok($doc->parse, 'Parse document');
+
+# Tokenization
+use_ok('KorAP::Tokenizer');
+
+my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
+
+# Get tokenization
+my $tokens = KorAP::Tokenizer->new(
+ path => $doc->path,
+ doc => $doc,
+ foundry => $token_base_foundry,
+ layer => $token_base_layer,
+ name => 'tokens'
+);
+ok($tokens, 'Token Object is fine');
+ok($tokens->parse, 'Token parsing is fine');
+
+$tokens->add('XIP', 'Constituency');
+my $output = decode_json( $tokens->to_json );
+is($output->{foundries}, 'xip xip/constituency', 'Foundries');
+
+# KorAP::Index::XIP::Constituency announces its layer as spans
+is($output->{layerInfos}, 'xip/c=spans', 'layerInfos');
+my $first_token = join('||', @{$output->{data}->[0]});
+#like($first_token, qr!<>:xip/s:s#0-179\$<i>21!, 'data');
+
+diag Dumper $output->{data}->[0];
+
+# No test plan was declared in advance
+done_testing;
diff --git a/t/sort_tokens.t b/t/sort_tokens.t
new file mode 100644
index 0000000..d698cac
--- /dev/null
+++ b/t/sort_tokens.t
@@ -0,0 +1,100 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use Mojo::ByteStream 'b';
+use utf8;
+use lib 'lib', '../lib';
+
+use_ok('KorAP::Field::MultiTermTokenStream');
+
+ok(my $mtt = KorAP::Field::MultiTermToken->new, 'New token');
+ok($mtt->o_start(0), 'Set start character offset');
+ok($mtt->o_end(5), 'Set end character offset');
+ok($mtt->add(term => '@:k=N',
+ payload =>'<s>9'), 'Add token');
+ok($mtt->add(term => 'a=N',
+ payload =>'<b>144'), 'Add token');
+ok($mtt->add(term => '<>:b=N',
+ o_start => 0,
+ o_end => 5,
+ p_end => 5), 'Add token');
+ok($mtt->add(term => 'c=N', payload => '<b>144'), 'Add token');
+ok($mtt->add(term => '<>:d=N',
+ o_start => 0,
+ o_end => 5,
+ p_end => 6,
+ payload => '<b>7'), 'Add token');
+ok($mtt->add(term => '@:j=N',
+ payload =>'<s>8'), 'Add token');
+ok($mtt->add(term => '<>:e=ADJ',
+ o_start => 0,
+ o_end => 5,
+ p_end => 6,
+ payload => '<b>6'), 'Add token');
+ok($mtt->add(term => '<>:f=N',
+ o_start => 0,
+ o_end => 5,
+ p_end => 6,
+ payload => '<b>5<b>122'), 'Add token');
+ok($mtt->add(term => 'g=N',
+ payload =>'<b>144'), 'Add token');
+ok($mtt->add(term => '@:h=N',
+ payload =>'<s>5'), 'Add token');
+ok($mtt->add(term => '@:i=N',
+ payload =>'<s>3'), 'Add token');
+
+is($mtt->to_string,'[(0-5)<>:b=N#0-5$<i>5|<>:f=N#0-5$<i>6<b>5<b>122|<>:e=ADJ#0-5$<i>6<b>6|<>:d=N#0-5$<i>6<b>7|@:i=N$<s>3|@:h=N$<s>5|@:j=N$<s>8|@:k=N$<s>9|a=N$<b>144|c=N$<b>144|g=N$<b>144]', 'Check string');
+
+ok($mtt = KorAP::Field::MultiTermToken->new, 'New token');
+ok($mtt->o_start(0), 'Set start character offset');
+ok($mtt->o_end(5), 'Set end character offset');
+
+# 2-7 to 2-4
+ok($mtt->add(term => '<:child-of', p_end => 7, payload => '<i>2<i>4<s>5<s>4<s>3'), 'New rel');
+
+# 2-4 to 3
+ok($mtt->add(term => '<:child-of', p_end => 4, payload => '<b>0<i>3<s>3<s>3<s>1'), 'New rel');
+
+# 2 to 2-4
+# <i>startright<i>endright<s>relation-id<s>left-id<s>right-id
+ok($mtt->add(term => '>:child-of', payload => '<i>2<i>4<s>2<s>1<s>3'), 'New rel');
+
+# 2-4 to 2-7
+ok($mtt->add(term => '>:child-of', p_end => 4, payload => '<i>2<i>7<s>1<s>3<s>4'), 'New rel');
+
+# 2-4 to 4
+ok($mtt->add(term => '<:child-of', p_end => 4, payload => '<b>0<i>4<s>4<s>3<s>1'), 'New rel');
+
+# 2-7 to 1-7
+ok($mtt->add(term => '>:child-of', p_end => 7, payload => '<i>1<i>7<s>2<s>4<s>2'), 'New rel');
+
+# 2-7 to 4-7
+ok($mtt->add(term => '<:child-of', p_end => 7, payload => '<i>4<i>7<s>6<s>4<s>2'), 'New rel');
+
+# 2 to 3
+ok($mtt->add(term => '>:child-of', payload => '<i>3<s>2<s>4<s>2'), 'New rel');
+
+is($mtt->to_string, '[(0-5)>:child-of$<i>2<i>4<s>2<s>1<s>3|>:child-of$<i>3<s>2<s>4<s>2|>:child-of$<i>4<i>2<i>7<s>1<s>3<s>4|<:child-of$<i>4<b>0<i>3<s>3<s>3<s>1|<:child-of$<i>4<b>0<i>4<s>4<s>3<s>1|>:child-of$<i>7<i>1<i>7<s>2<s>4<s>2|<:child-of$<i>7<i>2<i>4<s>5<s>4<s>3|<:child-of$<i>7<i>4<i>7<s>6<s>4<s>2]', 'Check sorted relations');
+# 2 -> 2-4
+# >:child-of$<i>2<i>4<s>2<s>1<s>3
+# 2 -> 3
+# >:child-of$<i>3<s>2<s>4<s>2
+# 2-4 -> 2-7
+# >:child-of$<i>4<i>2<i>7<s>1<s>3<s>4
+# 2-4 -> 3
+# <:child-of$<i>4<b>0<i>3<s>3<s>3<s>1
+# 2-4 -> 4
+# <:child-of$<i>4<b>0<i>4<s>4<s>3<s>1
+# 2-7 -> 1-7
+# >:child-of$<i>7<i>1<i>7<s>2<s>4<s>2
+# 2-7 -> 2-4
+# <:child-of$<i>7<i>2<i>4<s>5<s>4<s>3
+# 2-7 -> 4-7
+# <:child-of$<i>7<i>4<i>7<s>6<s>4<s>2
+
+done_testing;
+
+
+__END__
+
diff --git a/t/xip/constituency.xml b/t/xip/constituency.xml
new file mode 100644
index 0000000..d36678f
--- /dev/null
+++ b/t/xip/constituency.xml
@@ -0,0 +1,534 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+
+<layer xmlns="http://ids-mannheim.de/ns/KorAP" docid="GOE_AGA.03828" version="KorAP-0.4">
+ <spanList>
+ <span id="s1_n1" from="0" to="254">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">ROOT</f>
+ </fs>
+ <rel label="dominates" target="s1_n2"/>
+ </span>
+ <span id="s1_n2" from="0" to="254">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">S</f>
+ </fs>
+ <rel label="dominates" target="s1_n3"/>
+ <rel label="dominates" uri="morpho.xml#s1_n35"/>
+ <rel label="dominates" target="s1_n37"/>
+ <rel label="dominates" uri="morpho.xml#s1_n43"/>
+ <rel label="dominates" target="s1_n45"/>
+ <rel label="dominates" uri="morpho.xml#s1_n130"/>
+ </span>
+ <span id="s1_n3" from="0" to="58">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">PP</f>
+ </fs>
+ <rel label="dominates" target="s1_n4"/>
+ </span>
+ <span id="s1_n4" from="0" to="58">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">S</f>
+ </fs>
+ <rel label="dominates" target="s1_n5"/>
+ <rel label="dominates" target="s1_n23"/>
+ </span>
+ <span id="s1_n5" from="0" to="50">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">PP</f>
+ </fs>
+ <rel label="dominates" target="s1_n6"/>
+ </span>
+ <span id="s1_n6" from="0" to="50">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">S</f>
+ </fs>
+ <rel label="dominates" target="s1_n7"/>
+ <rel label="dominates" target="s1_n13"/>
+ </span>
+ <span id="s1_n7" from="0" to="17">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP</f>
+ </fs>
+ <rel label="dominates" target="s1_n8"/>
+ </span>
+ <span id="s1_n8" from="0" to="17">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">CNP</f>
+ </fs>
+ <rel label="dominates" target="s1_n9"/>
+ </span>
+ <span id="s1_n9" from="0" to="17">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP</f>
+ </fs>
+ <rel label="dominates" target="s1_n10"/>
+ </span>
+ <span id="s1_n10" from="0" to="17">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">AP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n11"/>
+ </span>
+ <span id="s1_n11" from="0" to="17">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">ADJA</f>
+ </fs>
+ </span>
+ <span id="s1_n13" from="18" to="50">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP</f>
+ </fs>
+ <rel label="dominates" target="s1_n14"/>
+ </span>
+ <span id="s1_n14" from="18" to="50">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">CNP</f>
+ </fs>
+ <rel label="dominates" target="s1_n15"/>
+ </span>
+ <span id="s1_n15" from="18" to="50">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP</f>
+ </fs>
+ <rel label="dominates" target="s1_n16"/>
+ <rel label="dominates" uri="morpho.xml#s1_n21"/>
+ </span>
+ <span id="s1_n16" from="18" to="48">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">AP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n17"/>
+ <rel label="dominates" uri="morpho.xml#s1_n19"/>
+ </span>
+ <span id="s1_n17" from="18" to="30">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">ADJA</f>
+ </fs>
+ </span>
+ <span id="s1_n19" from="31" to="48">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NN</f>
+ </fs>
+ </span>
+ <span id="s1_n21" from="49" to="50">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">TRUNC</f>
+ </fs>
+ </span>
+ <span id="s1_n23" from="50" to="58">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP-SB</f>
+ </fs>
+ <rel label="dominates" target="s1_n24"/>
+ </span>
+ <span id="s1_n24" from="50" to="58">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">CS</f>
+ </fs>
+ <rel label="dominates" target="s1_n25"/>
+ </span>
+ <span id="s1_n25" from="50" to="58">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">S</f>
+ </fs>
+ <rel label="dominates" target="s1_n26"/>
+ <rel label="dominates" target="s1_n32"/>
+ </span>
+ <span id="s1_n26" from="50" to="52">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">CNP-SB</f>
+ </fs>
+ <rel label="dominates" target="s1_n27"/>
+ </span>
+ <span id="s1_n27" from="50" to="52">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n28"/>
+ <rel label="dominates" uri="morpho.xml#s1_n30"/>
+ </span>
+ <span id="s1_n28" from="50" to="51">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">CARD</f>
+ </fs>
+ </span>
+ <span id="s1_n30" from="51" to="52">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">TRUNC</f>
+ </fs>
+ </span>
+ <span id="s1_n32" from="53" to="58">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP-OA</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n33"/>
+ </span>
+ <span id="s1_n33" from="53" to="58">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">ADV</f>
+ </fs>
+ </span>
+ <span id="s1_n35" from="59" to="60">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">XY</f>
+ </fs>
+ </span>
+ <span id="s1_n37" from="61" to="66">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP-SB</f>
+ </fs>
+ <rel label="dominates" target="s1_n38"/>
+ </span>
+ <span id="s1_n38" from="61" to="66">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">CS</f>
+ </fs>
+ <rel label="dominates" target="s1_n39"/>
+ </span>
+ <span id="s1_n39" from="61" to="66">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">S</f>
+ </fs>
+ <rel label="dominates" target="s1_n40"/>
+ </span>
+ <span id="s1_n40" from="61" to="66">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">VP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n41"/>
+ </span>
+ <span id="s1_n41" from="61" to="66">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">ADJD</f>
+ </fs>
+ </span>
+ <span id="s1_n43" from="66" to="67">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">$,</f>
+ </fs>
+ </span>
+ <span id="s1_n45" from="68" to="253">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">PP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n46"/>
+ <rel label="dominates" target="s1_n48"/>
+ <rel label="dominates" uri="morpho.xml#s1_n122"/>
+ <rel label="dominates" uri="morpho.xml#s1_n124"/>
+ <rel label="dominates" uri="morpho.xml#s1_n126"/>
+ <rel label="dominates" uri="morpho.xml#s1_n128"/>
+ </span>
+ <span id="s1_n46" from="68" to="72">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">APPR</f>
+ </fs>
+ </span>
+ <span id="s1_n48" from="73" to="218">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">AP</f>
+ </fs>
+ <rel label="dominates" target="s1_n49"/>
+ <rel label="dominates" uri="morpho.xml#s1_n120"/>
+ </span>
+ <span id="s1_n49" from="73" to="213">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">AVP</f>
+ </fs>
+ <rel label="dominates" target="s1_n50"/>
+ </span>
+ <span id="s1_n50" from="73" to="213">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">S</f>
+ </fs>
+ <rel label="dominates" target="s1_n51"/>
+ <rel label="dominates" uri="morpho.xml#s1_n69"/>
+ <rel label="dominates" target="s1_n71"/>
+ <rel label="dominates" target="s1_n117"/>
+ </span>
+ <span id="s1_n51" from="73" to="127">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP-SB</f>
+ </fs>
+ <rel label="dominates" target="s1_n52"/>
+ </span>
+ <span id="s1_n52" from="73" to="127">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">CNP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n53"/>
+ <rel label="dominates" uri="morpho.xml#s1_n55"/>
+ <rel label="dominates" target="s1_n57"/>
+ </span>
+ <span id="s1_n53" from="73" to="78">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">TRUNC</f>
+ </fs>
+ </span>
+ <span id="s1_n55" from="79" to="82">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">KON</f>
+ </fs>
+ </span>
+ <span id="s1_n57" from="83" to="127">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP</f>
+ </fs>
+ <rel label="dominates" target="s1_n58"/>
+ <rel label="dominates" uri="morpho.xml#s1_n65"/>
+ <rel label="dominates" uri="morpho.xml#s1_n67"/>
+ </span>
+ <span id="s1_n58" from="83" to="102">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">MPN</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n59"/>
+ <rel label="dominates" uri="morpho.xml#s1_n61"/>
+ <rel label="dominates" uri="morpho.xml#s1_n63"/>
+ </span>
+ <span id="s1_n59" from="83" to="85">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">FM</f>
+ </fs>
+ </span>
+ <span id="s1_n61" from="86" to="88">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">FM</f>
+ </fs>
+ </span>
+ <span id="s1_n63" from="89" to="102">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">FM</f>
+ </fs>
+ </span>
+ <span id="s1_n65" from="103" to="113">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">ADJA</f>
+ </fs>
+ </span>
+ <span id="s1_n67" from="114" to="127">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NN</f>
+ </fs>
+ </span>
+ <span id="s1_n69" from="128" to="133">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">VVFIN</f>
+ </fs>
+ </span>
+ <span id="s1_n71" from="134" to="208">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">AVP</f>
+ </fs>
+ <rel label="dominates" target="s1_n72"/>
+ </span>
+ <span id="s1_n72" from="134" to="208">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">S</f>
+ </fs>
+ <rel label="dominates" target="s1_n73"/>
+ <rel label="dominates" uri="morpho.xml#s1_n93"/>
+ <rel label="dominates" target="s1_n95"/>
+ </span>
+ <span id="s1_n73" from="134" to="179">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP-SB</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n74"/>
+ <rel label="dominates" target="s1_n76"/>
+ </span>
+ <span id="s1_n74" from="134" to="137">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">ART</f>
+ </fs>
+ </span>
+ <span id="s1_n76" from="138" to="179">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">CNP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n77"/>
+ <rel label="dominates" uri="morpho.xml#s1_n79"/>
+ <rel label="dominates" target="s1_n81"/>
+ <rel label="dominates" uri="morpho.xml#s1_n91"/>
+ </span>
+ <span id="s1_n77" from="138" to="149">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NN</f>
+ </fs>
+ </span>
+ <span id="s1_n79" from="150" to="153">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">KON</f>
+ </fs>
+ </span>
+ <span id="s1_n81" from="154" to="178">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n82"/>
+ <rel label="dominates" uri="morpho.xml#s1_n84"/>
+ <rel label="dominates" target="s1_n86"/>
+ </span>
+ <span id="s1_n82" from="154" to="157">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">ART</f>
+ </fs>
+ </span>
+ <span id="s1_n84" from="158" to="162">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NN</f>
+ </fs>
+ </span>
+ <span id="s1_n86" from="163" to="178">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n87"/>
+ <rel label="dominates" uri="morpho.xml#s1_n89"/>
+ </span>
+ <span id="s1_n87" from="163" to="169">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">PPOSAT</f>
+ </fs>
+ </span>
+ <span id="s1_n89" from="170" to="178">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NN</f>
+ </fs>
+ </span>
+ <span id="s1_n91" from="178" to="179">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">$.</f>
+ </fs>
+ </span>
+ <span id="s1_n93" from="180" to="183">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">VAFIN</f>
+ </fs>
+ </span>
+ <span id="s1_n95" from="184" to="208">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n96"/>
+ <rel label="dominates" target="s1_n98"/>
+ </span>
+ <span id="s1_n96" from="184" to="187">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">PIS</f>
+ </fs>
+ </span>
+ <span id="s1_n98" from="188" to="208">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n99"/>
+ <rel label="dominates" target="s1_n101"/>
+ </span>
+ <span id="s1_n99" from="188" to="191">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">ART</f>
+ </fs>
+ </span>
+ <span id="s1_n101" from="192" to="208">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">CNP</f>
+ </fs>
+ <rel label="dominates" target="s1_n102"/>
+ <rel label="dominates" uri="morpho.xml#s1_n107"/>
+ <rel label="dominates" target="s1_n109"/>
+ <rel label="dominates" target="s1_n112"/>
+ </span>
+ <span id="s1_n102" from="192" to="198">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n103"/>
+ <rel label="dominates" uri="morpho.xml#s1_n105"/>
+ </span>
+ <span id="s1_n103" from="192" to="196">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">APPR</f>
+ </fs>
+ </span>
+ <span id="s1_n105" from="197" to="198">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">XY</f>
+ </fs>
+ </span>
+ <span id="s1_n107" from="198" to="199">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">$,</f>
+ </fs>
+ </span>
+ <span id="s1_n109" from="200" to="202">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">AVP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n110"/>
+ </span>
+ <span id="s1_n110" from="200" to="202">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">ADV</f>
+ </fs>
+ </span>
+ <span id="s1_n112" from="203" to="208">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n113"/>
+ <rel label="dominates" uri="morpho.xml#s1_n115"/>
+ </span>
+ <span id="s1_n113" from="203" to="204">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">XY</f>
+ </fs>
+ </span>
+ <span id="s1_n115" from="205" to="208">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">XY</f>
+ </fs>
+ </span>
+ <span id="s1_n117" from="209" to="213">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NP-DA</f>
+ </fs>
+ <rel label="dominates" uri="morpho.xml#s1_n118"/>
+ </span>
+ <span id="s1_n118" from="209" to="213">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">PRF</f>
+ </fs>
+ </span>
+ <span id="s1_n120" from="214" to="218">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">PIDAT</f>
+ </fs>
+ </span>
+ <span id="s1_n122" from="220" to="226">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">ADJA</f>
+ </fs>
+ </span>
+ <span id="s1_n124" from="227" to="240">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">ADJA</f>
+ </fs>
+ </span>
+ <span id="s1_n126" from="241" to="249">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">NN</f>
+ </fs>
+ </span>
+ <span id="s1_n128" from="250" to="253">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">XY</f>
+ </fs>
+ </span>
+ <span id="s1_n130" from="253" to="254">
+ <fs xmlns="http://www.tei-c.org/ns/1.0" type="node">
+ <f name="const">$.</f>
+ </fs>
+ </span>
+ </spanList>
+</layer>