Fix RWK paragraph handling

Change-Id: Ibbc9548b023c86f4a7a435900f32444a74187e5a

commit: 56deacb172086a48f9ef4bb7e80c9b7ed15c5915 [log] [tgz]
author: Akron <nils@diewald-online.de> Mon Aug 10 10:03:55 2020 +0200
committer: Akron <nils@diewald-online.de> Mon Aug 10 10:57:41 2020 +0200
tree: 882a0f9f6f7f3a20b22447dc11d2898b707f57a4
parent: 5c602cbd069de12cbaa229ace9cf73132e3aac30 [diff]
diff --git a/Changes b/Changes
index ed2e8ff..b0b9492 100644
--- a/Changes
+++ b/Changes

@@ -1,4 +1,4 @@
-0.41 2020-08-07
+0.41 2020-08-10
         - Added support for RWK annotations.
         - Improved DGD support.
         - Fixed bug in RWK support that broke on
@@ -12,6 +12,7 @@
         - Introduced add_span() method to MultiTermToken.
         - Removed deprecated 'primary' flag.
         - Removed deprecated 'pretty' flag.
+        - Fix RWK paragraph handling.
 
 0.40 2020-03-03
         - Fixed XIP parser.

diff --git a/lib/KorAP/XML/Annotation/CoreNLP/Sentences.pm b/lib/KorAP/XML/Annotation/CoreNLP/Sentences.pm
index 31ef18c..708cd91 100644
--- a/lib/KorAP/XML/Annotation/CoreNLP/Sentences.pm
+++ b/lib/KorAP/XML/Annotation/CoreNLP/Sentences.pm

@@ -24,7 +24,7 @@
 
 
 sub layer_info {
-    ['corenlp/s=spans'];
+  ['corenlp/s=spans'];
 };
 
 1;

diff --git a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
index 77ac223..2143a1d 100644
--- a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
+++ b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm

@@ -104,7 +104,9 @@
         foreach (@$attrs) {
 
           # Add attributes
-          $mt = $mtt->add_by_term('@:dereko/s:' . $_->{'-name'} . ($_->{'#text'} ? ':' . $_->{'#text'} : ''));
+          $mt = $mtt->add_by_term(
+            '@:dereko/s:' . $_->{'-name'} .
+              ($_->{'#text'} ? ':' . $_->{'#text'} : ''));
           $mt->set_p_start($p_start);
           $mt->set_pti(17);
           $mt->set_payload($pl);
@@ -114,11 +116,12 @@
   ) or return;
 
   if ($as_base) {
+    my $s = $$self->stream;
     if (index($as_base, 'sentences') >= 0) {
-      $$self->stream->add_meta('base/sentences', '<i>' . $sentences);
+      $s->add_meta('base/sentences', '<i>' . $sentences);
     };
     if (index($as_base, 'paragraphs') >= 0) {
-      $$self->stream->add_meta('base/paragraphs', '<i>' . $paragraphs);
+      $s->add_meta('base/paragraphs', '<i>' . $paragraphs);
     };
   };
 

diff --git a/lib/KorAP/XML/Annotation/MDParser/Dependency.pm b/lib/KorAP/XML/Annotation/MDParser/Dependency.pm
index bb8b1ca..8b8f101 100644
--- a/lib/KorAP/XML/Annotation/MDParser/Dependency.pm
+++ b/lib/KorAP/XML/Annotation/MDParser/Dependency.pm

@@ -1,7 +1,5 @@
 package KorAP::XML::Annotation::MDParser::Dependency;
 use KorAP::XML::Annotation::Base;
-use strict;
-use warnings;
 
 sub parse {
   my $self = shift;

diff --git a/lib/KorAP/XML/Annotation/MarMoT/Morpho.pm b/lib/KorAP/XML/Annotation/MarMoT/Morpho.pm
index 026533f..eabebbb 100644
--- a/lib/KorAP/XML/Annotation/MarMoT/Morpho.pm
+++ b/lib/KorAP/XML/Annotation/MarMoT/Morpho.pm

@@ -1,6 +1,5 @@
 package KorAP::XML::Annotation::MarMoT::Morpho;
 use KorAP::XML::Annotation::Base;
-use Data::Dumper;
 
 sub parse {
   my $self = shift;

diff --git a/lib/KorAP/XML/Annotation/Mate/Dependency.pm b/lib/KorAP/XML/Annotation/Mate/Dependency.pm
index 97d69aa..3358121 100644
--- a/lib/KorAP/XML/Annotation/Mate/Dependency.pm
+++ b/lib/KorAP/XML/Annotation/Mate/Dependency.pm

@@ -1,8 +1,6 @@
 package KorAP::XML::Annotation::Mate::Dependency;
 use KorAP::XML::Annotation::Base;
-use strict;
-use warnings;
-our $NODE_LABEL = '&&&';
+# our $NODE_LABEL = '&&&';
 
 sub parse {
   my $self = shift;

diff --git a/lib/KorAP/XML/Annotation/OpenNLP/Sentences.pm b/lib/KorAP/XML/Annotation/OpenNLP/Sentences.pm
index 47aca86..9e3c58d 100644
--- a/lib/KorAP/XML/Annotation/OpenNLP/Sentences.pm
+++ b/lib/KorAP/XML/Annotation/OpenNLP/Sentences.pm

@@ -23,7 +23,7 @@
 };
 
 sub layer_info {
-    ['opennlp/s=spans'];
+  ['opennlp/s=spans'];
 };
 
 1;

diff --git a/lib/KorAP/XML/Annotation/RWK/Morpho.pm b/lib/KorAP/XML/Annotation/RWK/Morpho.pm
index 42f169b..d8ac162 100644
--- a/lib/KorAP/XML/Annotation/RWK/Morpho.pm
+++ b/lib/KorAP/XML/Annotation/RWK/Morpho.pm

@@ -39,7 +39,8 @@
         }
 
         # ana tag
-        elsif (($found = $f->{'#text'}) && $name =~ m/^(?:bc|(?:sub)?type|usage|person|pos|case|number|gender|tense|mood|degree)$/o) {
+        elsif (($found = $f->{'#text'}) &&
+                 $name =~ m/^(?:bc|(?:sub)?type|usage|person|pos|case|number|gender|tense|mood|degree)$/o) {
           $mtt->add_by_term('rwk/m:' . $name . ':' . $found);
         };
       };

diff --git a/lib/KorAP/XML/Annotation/RWK/Structure.pm b/lib/KorAP/XML/Annotation/RWK/Structure.pm
index 1173d13..b34cab5 100644
--- a/lib/KorAP/XML/Annotation/RWK/Structure.pm
+++ b/lib/KorAP/XML/Annotation/RWK/Structure.pm

@@ -3,13 +3,8 @@
 
 sub parse {
   my $self = shift;
+  my (@sentences, @paragraphs);
 
-  my %milestones = (
-    s => [],
-    p => [],
-  );
-
-  my ($p_start, $o_start) = (0,0);
   my ($last_p, $last_o) = (0,0);
 
   $$self->add_spandata(
@@ -34,10 +29,10 @@
 
       # Check only for anchors
       if ($name eq 's-milestone') {
-        push @{$milestones{s}}, [ $span->get_p_start, $span->get_o_start ];
+        push @sentences, [ $span->get_p_start, $span->get_o_start ];
       }
       elsif ($name eq 'p-milestone') {
-        push @{$milestones{p}}, [ $span->get_p_start, $span->get_o_start ];
+        push @paragraphs, [ $span->get_p_start, $span->get_o_start ];
       }
       else {
         $last_p = $span->get_p_start;
@@ -49,19 +44,24 @@
   my ($sentences, $paragraphs) = (0, 0);
 
   # Add final position
-  push @{$milestones{s}}, [$last_p, $last_o];
-  push @{$milestones{p}}, [$last_p, $last_o];
+  push @sentences, [$last_p, $last_o];
+  push @paragraphs, [$last_p, $last_o];
 
   my $stream = $$self->stream;
-  foreach my $type ('s', 'p') {
+  my %hash = (
+    s => \@sentences,
+    p => \@paragraphs
+  );
+  while (my ($type, $list) = each %hash) {
+    my ($p_start, $o_start) = (0,0);
 
     # Sort and unique milestones
-    @{$milestones{$type}} = sort {
+    @$list = sort {
       $a->[0] <=> $b->[0]
-    } @{$milestones{$type}};
+    } @$list;
 
     # Iterate overs milestones
-    foreach (@{$milestones{$type}}) {
+    foreach (@$list) {
 
       if (($_->[0] == $p_start) || ($_->[1] == $o_start)) {
         next;
@@ -77,7 +77,7 @@
         };
 
         $mtt = $stream->pos($p_start);
-      }
+      };
 
       # Add the base sentence
       my $mt = $mtt->add_by_term('<>:base/s:' . $type);
@@ -107,6 +107,7 @@
     };
   };
 
+  # Set meta information about sentence count
   return 1;
 };
 

diff --git a/lib/KorAP/XML/Annotation/Sgbr/Lemma.pm b/lib/KorAP/XML/Annotation/Sgbr/Lemma.pm
index aa0248a..1f93b46 100644
--- a/lib/KorAP/XML/Annotation/Sgbr/Lemma.pm
+++ b/lib/KorAP/XML/Annotation/Sgbr/Lemma.pm

@@ -28,9 +28,6 @@
         if (($f->{-name} eq 'lemma')
               && ($found = $f->{'#text'})) {
 
-          # $found = b($found)->decode('latin-1')->encode->to_string;
-          # warn $found;
-
           unless ($first++) {
             $mtt->add_by_term('sgbr/l:' . $found);
           }

diff --git a/lib/KorAP/XML/Annotation/XIP/Constituency.pm b/lib/KorAP/XML/Annotation/XIP/Constituency.pm
index 5ee1259..6dec877 100644
--- a/lib/KorAP/XML/Annotation/XIP/Constituency.pm
+++ b/lib/KorAP/XML/Annotation/XIP/Constituency.pm

@@ -55,10 +55,6 @@
 
         # It's definately not a root
         $xip_const_noroot->insert($target);
-
-        # if ($target =~ /^s2_n(?:36|58|59|60|40)$/) {
-        #   warn 'Probably not a root ' . $target . ' but ' . $span->id;
-        # };
       };
     }
   ) or return;
@@ -115,14 +111,9 @@
         $target = $1;
       };
 
-      # if ($span->id =~ /^s2_n(?:36|58|59|60|40)$/ && $target =~ /^s2_n(?:36|58|59|60|40)$/) {
-      # warn 'B: ' . $span->id . ' points to ' . $target;
-      # };
-
       next unless $target;
 
       my $subspan = delete $xip_const{$target};
-      # warn "A-Forgot about $target: " . ($subspan ? 'yes' : 'no');
 
       next unless $subspan;
 

diff --git a/lib/KorAP/XML/Annotation/XIP/Dependency.pm b/lib/KorAP/XML/Annotation/XIP/Dependency.pm
index 2323e00..5ed5912 100644
--- a/lib/KorAP/XML/Annotation/XIP/Dependency.pm
+++ b/lib/KorAP/XML/Annotation/XIP/Dependency.pm

@@ -47,7 +47,6 @@
 
         my $rel_token = $tokens->token($from, $to) or next;
 
-        # die $token->pos . ' -' . $label . '-> ' . $rel_token->pos;
         $mt = $mtt->add_by_term('>:xip/d:' . $label);
         $mt->set_payload('<i>' . $rel_token->get_pos);
       );
@@ -56,7 +55,6 @@
         ->add_by_term('<:xip/d:' . $label);
       $mt->set_payload('<i>' . $token->get_pos);
 
-    #	print $label,"\n";
     }
   ) or return;
 

diff --git a/t/real/rwk.t b/t/real/rwk.t
index 420e493..7e5642d 100644
--- a/t/real/rwk.t
+++ b/t/real/rwk.t

@@ -218,9 +218,10 @@
 
 $first = $output->{data}->{stream}->[0];
 
-is('-:base/paragraphs$<i>2',$first->[0]);
+is('-:base/paragraphs$<i>1',$first->[0]);
 is('-:base/sentences$<i>21',$first->[1]);
 is('-:tokens$<i>522',$first->[2]);
+
 is('<>:base/s:s$<b>64<i>0<i>139<i>23<b>1',$first->[3]);
 is('<>:base/s:p$<b>64<i>0<i>2631<i>449<b>1',$first->[4]);
 is('<>:base/s:t$<b>64<i>0<i>3062<i>522<b>0',$first->[5]);
@@ -303,7 +304,7 @@
 
 $first = $output->{data}->{stream}->[0];
 
-is('-:base/paragraphs$<i>2',$first->[0]);
+is('-:base/paragraphs$<i>1',$first->[0]);
 is('-:base/sentences$<i>21',$first->[1]);
 is('-:tokens$<i>522',$first->[2]);
 is('<>:base/s:s$<b>64<i>0<i>139<i>23<b>1',$first->[3]);
@@ -375,7 +376,7 @@
 
 $first = $output->{data}->{stream}->[0];
 
-is('-:base/paragraphs$<i>2',$first->[0]);
+is('-:base/paragraphs$<i>1',$first->[0]);
 is('-:base/sentences$<i>33',$first->[1]);
 is('-:tokens$<i>511',$first->[2]);
 is('<>:base/s:s$<b>64<i>0<i>6<i>2<b>1',$first->[3]);
@@ -423,7 +424,7 @@
 
 $first = $output->{data}->{stream}->[0];
 
-is('-:base/paragraphs$<i>2',$first->[0]);
+is('-:base/paragraphs$<i>1',$first->[0]);
 is('-:base/sentences$<i>68',$first->[1]);
 
 done_testing;
commit	56deacb172086a48f9ef4bb7e80c9b7ed15c5915	[log] [tgz]
author	Akron <nils@diewald-online.de>	Mon Aug 10 10:03:55 2020 +0200
committer	Akron <nils@diewald-online.de>	Mon Aug 10 10:57:41 2020 +0200
tree	882a0f9f6f7f3a20b22447dc11d2898b707f57a4
parent	5c602cbd069de12cbaa229ace9cf73132e3aac30 [diff]