Fix RWK paragraph handling
Change-Id: Ibbc9548b023c86f4a7a435900f32444a74187e5a
diff --git a/Changes b/Changes
index ed2e8ff..b0b9492 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.41 2020-08-07
+0.41 2020-08-10
- Added support for RWK annotations.
- Improved DGD support.
- Fixed bug in RWK support that broke on
@@ -12,6 +12,7 @@
- Introduced add_span() method to MultiTermToken.
- Removed deprecated 'primary' flag.
- Removed deprecated 'pretty' flag.
+ - Fixed RWK paragraph handling.
0.40 2020-03-03
- Fixed XIP parser.
diff --git a/lib/KorAP/XML/Annotation/CoreNLP/Sentences.pm b/lib/KorAP/XML/Annotation/CoreNLP/Sentences.pm
index 31ef18c..708cd91 100644
--- a/lib/KorAP/XML/Annotation/CoreNLP/Sentences.pm
+++ b/lib/KorAP/XML/Annotation/CoreNLP/Sentences.pm
@@ -24,7 +24,7 @@
sub layer_info {
- ['corenlp/s=spans'];
+ ['corenlp/s=spans'];
};
1;
diff --git a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
index 77ac223..2143a1d 100644
--- a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
+++ b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
@@ -104,7 +104,9 @@
foreach (@$attrs) {
# Add attributes
- $mt = $mtt->add_by_term('@:dereko/s:' . $_->{'-name'} . ($_->{'#text'} ? ':' . $_->{'#text'} : ''));
+ $mt = $mtt->add_by_term(
+ '@:dereko/s:' . $_->{'-name'} .
+ ($_->{'#text'} ? ':' . $_->{'#text'} : ''));
$mt->set_p_start($p_start);
$mt->set_pti(17);
$mt->set_payload($pl);
@@ -114,11 +116,12 @@
) or return;
if ($as_base) {
+ my $s = $$self->stream;
if (index($as_base, 'sentences') >= 0) {
- $$self->stream->add_meta('base/sentences', '<i>' . $sentences);
+ $s->add_meta('base/sentences', '<i>' . $sentences);
};
if (index($as_base, 'paragraphs') >= 0) {
- $$self->stream->add_meta('base/paragraphs', '<i>' . $paragraphs);
+ $s->add_meta('base/paragraphs', '<i>' . $paragraphs);
};
};
diff --git a/lib/KorAP/XML/Annotation/MDParser/Dependency.pm b/lib/KorAP/XML/Annotation/MDParser/Dependency.pm
index bb8b1ca..8b8f101 100644
--- a/lib/KorAP/XML/Annotation/MDParser/Dependency.pm
+++ b/lib/KorAP/XML/Annotation/MDParser/Dependency.pm
@@ -1,7 +1,5 @@
package KorAP::XML::Annotation::MDParser::Dependency;
use KorAP::XML::Annotation::Base;
-use strict;
-use warnings;
sub parse {
my $self = shift;
diff --git a/lib/KorAP/XML/Annotation/MarMoT/Morpho.pm b/lib/KorAP/XML/Annotation/MarMoT/Morpho.pm
index 026533f..eabebbb 100644
--- a/lib/KorAP/XML/Annotation/MarMoT/Morpho.pm
+++ b/lib/KorAP/XML/Annotation/MarMoT/Morpho.pm
@@ -1,6 +1,5 @@
package KorAP::XML::Annotation::MarMoT::Morpho;
use KorAP::XML::Annotation::Base;
-use Data::Dumper;
sub parse {
my $self = shift;
diff --git a/lib/KorAP/XML/Annotation/Mate/Dependency.pm b/lib/KorAP/XML/Annotation/Mate/Dependency.pm
index 97d69aa..3358121 100644
--- a/lib/KorAP/XML/Annotation/Mate/Dependency.pm
+++ b/lib/KorAP/XML/Annotation/Mate/Dependency.pm
@@ -1,8 +1,6 @@
package KorAP::XML::Annotation::Mate::Dependency;
use KorAP::XML::Annotation::Base;
-use strict;
-use warnings;
-our $NODE_LABEL = '&&&';
+# our $NODE_LABEL = '&&&';
sub parse {
my $self = shift;
diff --git a/lib/KorAP/XML/Annotation/OpenNLP/Sentences.pm b/lib/KorAP/XML/Annotation/OpenNLP/Sentences.pm
index 47aca86..9e3c58d 100644
--- a/lib/KorAP/XML/Annotation/OpenNLP/Sentences.pm
+++ b/lib/KorAP/XML/Annotation/OpenNLP/Sentences.pm
@@ -23,7 +23,7 @@
};
sub layer_info {
- ['opennlp/s=spans'];
+ ['opennlp/s=spans'];
};
1;
diff --git a/lib/KorAP/XML/Annotation/RWK/Morpho.pm b/lib/KorAP/XML/Annotation/RWK/Morpho.pm
index 42f169b..d8ac162 100644
--- a/lib/KorAP/XML/Annotation/RWK/Morpho.pm
+++ b/lib/KorAP/XML/Annotation/RWK/Morpho.pm
@@ -39,7 +39,8 @@
}
# ana tag
- elsif (($found = $f->{'#text'}) && $name =~ m/^(?:bc|(?:sub)?type|usage|person|pos|case|number|gender|tense|mood|degree)$/o) {
+ elsif (($found = $f->{'#text'}) &&
+ $name =~ m/^(?:bc|(?:sub)?type|usage|person|pos|case|number|gender|tense|mood|degree)$/o) {
$mtt->add_by_term('rwk/m:' . $name . ':' . $found);
};
};
diff --git a/lib/KorAP/XML/Annotation/RWK/Structure.pm b/lib/KorAP/XML/Annotation/RWK/Structure.pm
index 1173d13..b34cab5 100644
--- a/lib/KorAP/XML/Annotation/RWK/Structure.pm
+++ b/lib/KorAP/XML/Annotation/RWK/Structure.pm
@@ -3,13 +3,8 @@
sub parse {
my $self = shift;
+ my (@sentences, @paragraphs);
- my %milestones = (
- s => [],
- p => [],
- );
-
- my ($p_start, $o_start) = (0,0);
my ($last_p, $last_o) = (0,0);
$$self->add_spandata(
@@ -34,10 +29,10 @@
# Check only for anchors
if ($name eq 's-milestone') {
- push @{$milestones{s}}, [ $span->get_p_start, $span->get_o_start ];
+ push @sentences, [ $span->get_p_start, $span->get_o_start ];
}
elsif ($name eq 'p-milestone') {
- push @{$milestones{p}}, [ $span->get_p_start, $span->get_o_start ];
+ push @paragraphs, [ $span->get_p_start, $span->get_o_start ];
}
else {
$last_p = $span->get_p_start;
@@ -49,19 +44,24 @@
my ($sentences, $paragraphs) = (0, 0);
# Add final position
- push @{$milestones{s}}, [$last_p, $last_o];
- push @{$milestones{p}}, [$last_p, $last_o];
+ push @sentences, [$last_p, $last_o];
+ push @paragraphs, [$last_p, $last_o];
my $stream = $$self->stream;
- foreach my $type ('s', 'p') {
+ my %hash = (
+ s => \@sentences,
+ p => \@paragraphs
+ );
+ while (my ($type, $list) = each %hash) {
+ my ($p_start, $o_start) = (0,0);
# Sort and unique milestones
- @{$milestones{$type}} = sort {
+ @$list = sort {
$a->[0] <=> $b->[0]
- } @{$milestones{$type}};
+ } @$list;
# Iterate overs milestones
- foreach (@{$milestones{$type}}) {
+ foreach (@$list) {
if (($_->[0] == $p_start) || ($_->[1] == $o_start)) {
next;
@@ -77,7 +77,7 @@
};
$mtt = $stream->pos($p_start);
- }
+ };
# Add the base sentence
my $mt = $mtt->add_by_term('<>:base/s:' . $type);
@@ -107,6 +107,7 @@
};
};
+ # Set meta information about sentence count
return 1;
};
diff --git a/lib/KorAP/XML/Annotation/Sgbr/Lemma.pm b/lib/KorAP/XML/Annotation/Sgbr/Lemma.pm
index aa0248a..1f93b46 100644
--- a/lib/KorAP/XML/Annotation/Sgbr/Lemma.pm
+++ b/lib/KorAP/XML/Annotation/Sgbr/Lemma.pm
@@ -28,9 +28,6 @@
if (($f->{-name} eq 'lemma')
&& ($found = $f->{'#text'})) {
- # $found = b($found)->decode('latin-1')->encode->to_string;
- # warn $found;
-
unless ($first++) {
$mtt->add_by_term('sgbr/l:' . $found);
}
diff --git a/lib/KorAP/XML/Annotation/XIP/Constituency.pm b/lib/KorAP/XML/Annotation/XIP/Constituency.pm
index 5ee1259..6dec877 100644
--- a/lib/KorAP/XML/Annotation/XIP/Constituency.pm
+++ b/lib/KorAP/XML/Annotation/XIP/Constituency.pm
@@ -55,10 +55,6 @@
# It's definately not a root
$xip_const_noroot->insert($target);
-
- # if ($target =~ /^s2_n(?:36|58|59|60|40)$/) {
- # warn 'Probably not a root ' . $target . ' but ' . $span->id;
- # };
};
}
) or return;
@@ -115,14 +111,9 @@
$target = $1;
};
- # if ($span->id =~ /^s2_n(?:36|58|59|60|40)$/ && $target =~ /^s2_n(?:36|58|59|60|40)$/) {
- # warn 'B: ' . $span->id . ' points to ' . $target;
- # };
-
next unless $target;
my $subspan = delete $xip_const{$target};
- # warn "A-Forgot about $target: " . ($subspan ? 'yes' : 'no');
next unless $subspan;
diff --git a/lib/KorAP/XML/Annotation/XIP/Dependency.pm b/lib/KorAP/XML/Annotation/XIP/Dependency.pm
index 2323e00..5ed5912 100644
--- a/lib/KorAP/XML/Annotation/XIP/Dependency.pm
+++ b/lib/KorAP/XML/Annotation/XIP/Dependency.pm
@@ -47,7 +47,6 @@
my $rel_token = $tokens->token($from, $to) or next;
- # die $token->pos . ' -' . $label . '-> ' . $rel_token->pos;
$mt = $mtt->add_by_term('>:xip/d:' . $label);
$mt->set_payload('<i>' . $rel_token->get_pos);
);
@@ -56,7 +55,6 @@
->add_by_term('<:xip/d:' . $label);
$mt->set_payload('<i>' . $token->get_pos);
- # print $label,"\n";
}
) or return;
diff --git a/t/real/rwk.t b/t/real/rwk.t
index 420e493..7e5642d 100644
--- a/t/real/rwk.t
+++ b/t/real/rwk.t
@@ -218,9 +218,10 @@
$first = $output->{data}->{stream}->[0];
-is('-:base/paragraphs$<i>2',$first->[0]);
+is('-:base/paragraphs$<i>1',$first->[0]);
is('-:base/sentences$<i>21',$first->[1]);
is('-:tokens$<i>522',$first->[2]);
+
is('<>:base/s:s$<b>64<i>0<i>139<i>23<b>1',$first->[3]);
is('<>:base/s:p$<b>64<i>0<i>2631<i>449<b>1',$first->[4]);
is('<>:base/s:t$<b>64<i>0<i>3062<i>522<b>0',$first->[5]);
@@ -303,7 +304,7 @@
$first = $output->{data}->{stream}->[0];
-is('-:base/paragraphs$<i>2',$first->[0]);
+is('-:base/paragraphs$<i>1',$first->[0]);
is('-:base/sentences$<i>21',$first->[1]);
is('-:tokens$<i>522',$first->[2]);
is('<>:base/s:s$<b>64<i>0<i>139<i>23<b>1',$first->[3]);
@@ -375,7 +376,7 @@
$first = $output->{data}->{stream}->[0];
-is('-:base/paragraphs$<i>2',$first->[0]);
+is('-:base/paragraphs$<i>1',$first->[0]);
is('-:base/sentences$<i>33',$first->[1]);
is('-:tokens$<i>511',$first->[2]);
is('<>:base/s:s$<b>64<i>0<i>6<i>2<b>1',$first->[3]);
@@ -423,7 +424,7 @@
$first = $output->{data}->{stream}->[0];
-is('-:base/paragraphs$<i>2',$first->[0]);
+is('-:base/paragraphs$<i>1',$first->[0]);
is('-:base/sentences$<i>68',$first->[1]);
done_testing;