Added base-sentences and base-paragraphs options

Change-Id: I695b65661d97785e75703207bfc83a316d0a4815

commit: 3741f8b0a0d6f8825ca8cf086338599cb53e6aa1 [log] [tgz]
author: Akron <nils@diewald-online.de> Wed Dec 21 19:55:21 2016 +0100
committer: Akron <nils@diewald-online.de> Wed Dec 21 19:55:21 2016 +0100
tree: 42a6652b5ee26e5f0ff35c91e4a2eaec6e85937b
parent: 53167fd2d9f52a82ff68b6a89a09b0074a65ca25 [diff]
diff --git a/Changes b/Changes
index a1ecbe9..e08e227 100644
--- a/Changes
+++ b/Changes

@@ -1,9 +1,12 @@
+0.24 2016-12-21
+        - Added --base-sentences and --base-paragraphs options
+
 0.23 2016-11-03
         - Added wildcard support for document extraction
         - Fixed archive iteration to not duplicate the first archive
         - Added parallel extraction for document sigles
         - Improved return value for existing files
-        - Don't warn on recursion in CoreNLP/Constituency.
+        - Don't warn on recursion in CoreNLP/Constituency
 
 0.22 2016-10-26
         - Added support for document extraction

diff --git a/Readme.pod b/Readme.pod
index 0c2272b..5fab72d 100644
--- a/Readme.pod
+++ b/Readme.pod

@@ -97,12 +97,31 @@
 
 Overwrite files that already exist.
 
-=item B<--token|-t> <foundry>[#<file>]
+=item B<--token|-t> <foundry>#<file>
 
 Define the default tokenization by specifying
 the name of the foundry and optionally the name
 of the layer-file. Defaults to C<OpenNLP#tokens>.
 
+
+=item B<--base-sentences|-bs> <foundry>#<layer>
+
+Define the layer for base sentences.
+If given, this will be used instead of using C<Base#Sentences>.
+Currently C<DeReKo#Structure> is the only additional layer supported.
+
+ Defaults to unset.
+
+
+=item B<--base-paragraphs|-bp> <foundry>#<layer>
+
+Define the layer for base paragraphs.
+If given, this will be used instead of using C<Base#Paragraphs>.
+Currently C<DeReKo#Structure> is the only additional layer supported.
+
+ Defaults to unset.
+
+
 =item B<--skip|-s> <foundry>[#<layer>]
 
 Skip specific annotations by specifying the foundry

diff --git a/lib/KorAP/XML/Annotation/Base/Paragraphs.pm b/lib/KorAP/XML/Annotation/Base/Paragraphs.pm
index ced09c2..235e2ff 100644
--- a/lib/KorAP/XML/Annotation/Base/Paragraphs.pm
+++ b/lib/KorAP/XML/Annotation/Base/Paragraphs.pm

@@ -13,12 +13,12 @@
       my $mtt = $stream->pos($span->p_start);
 
       $mtt->add(
-	term => '<>:base/s:p',
-	o_start => $span->o_start,
-	o_end => $span->o_end,
-	p_end => $span->p_end,
-	payload => '<b>1',
-	pti => 64
+        term => '<>:base/s:p',
+        o_start => $span->o_start,
+        o_end => $span->o_end,
+        p_end => $span->p_end,
+        payload => '<b>1',
+        pti => 64
       );
       $i++;
     }

diff --git a/lib/KorAP/XML/Annotation/Base/Sentences.pm b/lib/KorAP/XML/Annotation/Base/Sentences.pm
index 28c9434..1d66c8a 100644
--- a/lib/KorAP/XML/Annotation/Base/Sentences.pm
+++ b/lib/KorAP/XML/Annotation/Base/Sentences.pm

@@ -16,12 +16,12 @@
 
       $first = [$span->p_start, $span->o_start] unless defined $first;
       $mtt->add(
-	term => '<>:base/s:s',
-	o_start => $span->o_start,
-	o_end => $span->o_end,
-	p_end => $span->p_end,
-	payload => '<b>2',
-	pti => 64
+        term => '<>:base/s:s',
+        o_start => $span->o_start,
+        o_end => $span->o_end,
+        p_end => $span->p_end,
+        payload => '<b>2',
+        pti => 64
       );
       $last_p = $span->p_end;
       $last_o = $span->o_end;
@@ -29,15 +29,15 @@
     }
   ) or return;
 
-#  my $mt = $$self->stream->pos($first->[0]);
-#  $mt->add(
-#    term => '<>:base/s:t',
-#    o_start => $first->[1],
-#    p_end => $last_p,
-#    o_end => $last_o,
-#    payload => '<b>0',
-#    pti => 64
-#  );
+  #  my $mt = $$self->stream->pos($first->[0]);
+  #  $mt->add(
+  #    term => '<>:base/s:t',
+  #    o_start => $first->[1],
+  #    p_end => $last_p,
+  #    o_end => $last_o,
+  #    payload => '<b>0',
+  #    pti => 64
+  #  );
 
   $$self->stream->add_meta('base/sentences', '<i>' . $i);
 

diff --git a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
index 80a284e..ced5476 100644
--- a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
+++ b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm

@@ -1,9 +1,10 @@
 package KorAP::XML::Annotation::DeReKo::Structure;
 use KorAP::XML::Annotation::Base;
-use Data::Dumper;
 
 sub parse {
   my $self = shift;
+  my $as_base = shift // 0;
+  my ($sentences, $paragraphs) = (0,0);
 
   $$self->add_spandata(
     foundry => 'struct',
@@ -21,10 +22,10 @@
 
       # Get attributes
       if (ref $feature eq 'ARRAY') {
-	$attrs = $feature->[1]->{fs}->{f};
-	$attrs = ref $attrs eq 'ARRAY' ? $attrs : [$attrs];
-	$feature = $feature->[0];
-	$tui = $stream->tui($p_start);
+        $attrs = $feature->[1]->{fs}->{f};
+        $attrs = ref $attrs eq 'ARRAY' ? $attrs : [$attrs];
+        $feature = $feature->[0];
+        $tui = $stream->tui($p_start);
       };
 
       # Get term label
@@ -37,42 +38,71 @@
 
       # Add structure
       my $mt = $mtt->add(
-	term    => '<>:dereko/s:' . $name,
-	o_start => $span->o_start,
-	o_end   => $span->o_end,
-	p_start => $p_start,
-	p_end   => $p_end,
-	pti     => $span->milestone ? 65 : 64,
+        term    => '<>:dereko/s:' . $name,
+        o_start => $span->o_start,
+        o_end   => $span->o_end,
+        p_start => $p_start,
+        p_end   => $p_end,
+        pti     => $span->milestone ? 65 : 64,
       );
 
       my $level = $span->hash->{'-l'};
       if ($level || $tui) {
-	my $pl;
-	$pl .= '<b>' . ($level ? $level - 1 : 0);
-	$pl .= '<s>' . $tui if $tui;
-	$mt->payload($pl);
+        my $pl;
+        $pl .= '<b>' . ($level ? $level - 1 : 0);
+        $pl .= '<s>' . $tui if $tui;
+        $mt->payload($pl);
+      };
+
+      # Use sentence and paragraph elements for base
+      if ($as_base && ($name eq 's' || $name eq 'p')) {
+
+        # Clone Multiterm
+        my $mt2 = $mt->clone;
+        $mt2->term('<>:base/s:' . $name);
+
+        if ($name eq 's' && index($as_base, 'sentences') >= 0) {
+          $mt2->payload('<b>2');
+          $sentences++;
+        }
+        elsif ($name eq 'p' && index($as_base, 'paragraphs') >= 0) {
+          $mt2->payload('<b>1');
+          $paragraphs++;
+        };
+
+        # Add to stream
+        $mtt->add($mt2);
       };
 
       # Add attributes
       if ($attrs) {
 
-	# Set a tui if attributes are set
-	foreach (@$attrs) {
+        # Set a tui if attributes are set
+        foreach (@$attrs) {
 
-	  # Add attributes
-	  $mtt->add(
-	    term =>
-	      '@:dereko/s:' . $_->{'-name'} . ':' . $_->{'#text'},
-	    p_start => $p_start,
-	    pti     => 17,
-	    payload => '<s>' . $tui .
-	      ($span->milestone ? '' : '<i>' . $p_end)
-	  );
-	};
+          # Add attributes
+          $mtt->add(
+            term =>
+              '@:dereko/s:' . $_->{'-name'} . ':' . $_->{'#text'},
+            p_start => $p_start,
+            pti     => 17,
+            payload => '<s>' . $tui .
+              ($span->milestone ? '' : '<i>' . $p_end)
+            );
+        };
       };
     }
   ) or return;
 
+  if ($as_base) {
+    if (index($as_base, 'sentences') >= 0) {
+      $$self->stream->add_meta('base/sentences', '<i>' . $sentences);
+    };
+    if (index($as_base, 'paragraphs') >= 0) {
+      $$self->stream->add_meta('base/paragraphs', '<i>' . $paragraphs);
+    };
+  };
+
   return 1;
 };
 

diff --git a/lib/KorAP/XML/Index/MultiTerm.pm b/lib/KorAP/XML/Index/MultiTerm.pm
index 87946e2..7e089de 100644
--- a/lib/KorAP/XML/Index/MultiTerm.pm
+++ b/lib/KorAP/XML/Index/MultiTerm.pm

@@ -128,8 +128,8 @@
       '<i>' . $_[0]->[4];
   };
 
-#  my $pl = $_[0]->[1] ?
-#    $_[0]->[1] - 1 : $_[0]->[0];
+  #  my $pl = $_[0]->[1] ?
+  #    $_[0]->[1] - 1 : $_[0]->[0];
 
   if ($_[0]->[2] || $_[0]->[0]) {
 
@@ -139,10 +139,10 @@
     };
     if ($_[0]->[0]) {
       if (index($_[0]->[0], '<') == 0) {
-	$pre .= $_[0]->[0];
+        $pre .= $_[0]->[0];
       }
       else {
-	$pre .= '<?>' . $_[0]->[0];
+        $pre .= '<?>' . $_[0]->[0];
       };
     };
   };
@@ -151,6 +151,11 @@
 };
 
 
+sub clone {
+  my $self = shift;
+  bless [@$self], __PACKAGE__;
+};
+
 sub to_string_2 {
   my $self = shift;
   my $string = $self->term;
@@ -166,10 +171,10 @@
     };
     if ($self->payload) {
       if (index($self->payload, '<') == 0) {
-	$string .= $self->payload;
+        $string .= $self->payload;
       }
       else {
-	$string .= '<?>' . $self->payload;
+        $string .= '<?>' . $self->payload;
       };
     };
   };

diff --git a/lib/KorAP/XML/Index/MultiTermToken.pm b/lib/KorAP/XML/Index/MultiTermToken.pm
index 9e78df8..3556422 100644
--- a/lib/KorAP/XML/Index/MultiTermToken.pm
+++ b/lib/KorAP/XML/Index/MultiTermToken.pm

@@ -1,5 +1,6 @@
 package KorAP::XML::Index::MultiTermToken;
 use KorAP::XML::Index::MultiTerm;
+use Scalar::Util qw/blessed/;
 use List::MoreUtils 'uniq';
 use Carp qw/carp croak/;
 use strict;
@@ -16,8 +17,9 @@
 
 sub add {
   my $self = shift;
+
   my $mt;
-  unless (ref $_[0] eq 'MultiTerm') {
+  unless (blessed $_[0]) {
     if (@_ == 1) {
       $mt = KorAP::XML::Index::MultiTerm->new(term => $_[0]);
     }

diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 968d3d9..1f5fad8 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm

@@ -15,7 +15,7 @@
 use Data::Dumper;
 use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
 
-our $VERSION = '0.23';
+our $VERSION = '0.24';
 
 has 'path';
 has [qw/text_sigle doc_sigle corpus_sigle/];

diff --git a/script/korapxml2krill b/script/korapxml2krill
index 27eb360..7974c76 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill

@@ -73,12 +73,15 @@
 # 2016/10/24
 # - Added support for document extraction
 #
-# 1016/10/27
+# 2016/10/27
 # - Added wildcard support for document extraction
 #
+# 2016/12/21
+# - Added support for base-sentences and base-paragraphs
+#
 # ----------------------------------------------------------
 
-our $LAST_CHANGE = '2016/10/27';
+our $LAST_CHANGE = '2016/12/21';
 our $LOCAL = $FindBin::Bin;
 our $VERSION_MSG = <<"VERSION";
 Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -101,6 +104,8 @@
   'overwrite|w' => \(my $overwrite),
   'meta|m=s'    => \(my $meta),
   'token|t=s'   => \(my $token_base = 'OpenNLP#tokens'),
+  'base-sentences|bs=s' => \(my $base_sentences = ''),
+  'base-paragraphs|bp=s' => \(my $base_paragraphs = ''),
   'gzip|z'      => \(my $gzip),
   'skip|s=s'    => \@skip,
   'sigle|sg=s'  => \@sigle,
@@ -130,6 +135,9 @@
   }
 );
 
+$base_sentences = lc $base_sentences;
+$base_paragraphs = lc $base_paragraphs;
+
 my %ERROR_HASH = (
   -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
   -verbose  => 99,
@@ -158,8 +166,8 @@
 $skip{lc($_)} = 1 foreach @skip;
 
 my @layers;
-push(@layers, ['Base', 'Sentences']);
-push(@layers, ['Base', 'Paragraphs']);
+push(@layers, ['Base', 'Sentences']) unless $base_sentences;
+push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;
 
 # Connexor
 push(@layers, ['Connexor', 'Morpho']);
@@ -173,8 +181,20 @@
 push(@layers, ['CoreNLP', 'Morpho']);
 push(@layers, ['CoreNLP', 'Constituency']);
 
+
 # DeReKo
-push(@layers, ['DeReKo', 'Structure']);
+if ($base_sentences eq 'dereko#structure' && $base_paragraphs eq 'dereko#structure') {
+  push(@layers, ['DeReKo', 'Structure', 'base-sentences-paragraphs']);
+}
+elsif ($base_sentences eq 'dereko#structure') {
+  push(@layers, ['DeReKo', 'Structure', 'base-sentences']);
+}
+elsif ($base_paragraphs eq 'dereko#structure') {
+  push(@layers, ['DeReKo', 'Structure', 'base-paragraphs']);
+}
+else {
+  push(@layers, ['DeReKo', 'Structure']);
+};
 
 # Glemm
 push(@layers, ['Glemm', 'Morpho']);
@@ -670,12 +690,31 @@
 
 Overwrite files that already exist.
 
-=item B<--token|-t> <foundry>[#<file>]
+=item B<--token|-t> <foundry>#<file>
 
 Define the default tokenization by specifying
 the name of the foundry and optionally the name
 of the layer-file. Defaults to C<OpenNLP#tokens>.
 
+
+=item B<--base-sentences|-bs> <foundry>#<layer>
+
+Define the layer for base sentences.
+If given, this will be used instead of using C<Base#Sentences>.
+Currently C<DeReKo#Structure> is the only additional layer supported.
+
+ Defaults to unset.
+
+
+=item B<--base-paragraphs|-bp> <foundry>#<layer>
+
+Define the layer for base paragraphs.
+If given, this will be used instead of using C<Base#Paragraphs>.
+Currently C<DeReKo#Structure> is the only additional layer supported.
+
+ Defaults to unset.
+
+
 =item B<--skip|-s> <foundry>[#<layer>]
 
 Skip specific annotations by specifying the foundry

diff --git a/t/real/goethe-2.t b/t/real/goethe-2.t
index 2b4f720..d197617 100644
--- a/t/real/goethe-2.t
+++ b/t/real/goethe-2.t

@@ -128,18 +128,31 @@
 ok(!exists $output->{docEditor}, 'Correct Text Type');
 
 ## Base
-$tokens->add('DeReKo', 'Structure');
+$tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs');
 
 $output = $tokens->to_data;
 
-is($output->{data}->{foundries}, 'dereko dereko/structure', 'Foundries');
+is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs', 'Foundries');
 is($output->{data}->{layerInfos}, 'dereko/s=spans', 'layerInfos');
 my $first_token = join('||', @{$output->{data}->{stream}->[0]});
 like($first_token, qr/s:Autobiographische/, 'data');
 like($first_token, qr/_0\$<i>0<i>17/, 'data');
 like($first_token, qr!<>:dereko/s:s\$<b>64<i>0<i>30<i>2<b>4!, 'data');
-#like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>30<i>2<b>4!, 'data');
-#like($first_token, qr!<>:base\/s:t\$<b>64<i>0<i>35199<i>5226<b>0!, 'data');
+like($first_token, qr!<>:base\/s:t\$<b>64<i>0<i>35250<i>5233<b>0!, 'data');
+like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>30<i>2<b>2!, 'data');
+like($first_token, qr!-:base\/paragraphs\$\<i\>14!, 'data');
+like($first_token, qr!-:base\/sentences\$\<i\>215!, 'data');
+
+# Check paragraph
+$first_token = join('||', @{$output->{data}->{stream}->[4]});
+like($first_token, qr/s:immer/, 'data');
+like($first_token, qr!<>:base\/s:s\$<b>64<i>53<i>254<i>32<b>2!, 'data');
+like($first_token, qr!<>:dereko\/s:s\$<b>64<i>53<i>254<i>32<b>5<s>1!, 'data');
+like($first_token, qr!<>:base/s:p\$\<b>64<i>53<i>3299<i>504<b>1!, 'data');
+like($first_token, qr!<>:dereko/s:p\$\<b>64<i>53<i>3299<i>504<b>4!, 'data');
+
+$first_token = join('||', @{$output->{data}->{stream}->[180]});
+like($first_token, qr/i:geschäften/, 'data');
 
 done_testing;
 __END__

diff --git a/t/script/base.t b/t/script/base.t
new file mode 100644
index 0000000..bd2d8e5
--- /dev/null
+++ b/t/script/base.t

@@ -0,0 +1,69 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/ :POSIX /;
+use Mojo::Util qw/slurp/;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output;
+use Data::Dumper;
+use utf8;
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+
+my $input = catdir($f, '..', 'corpus', 'GOE2', 'AGA', '03828');
+ok(-d $input, 'Input directory found');
+
+my $output = tmpnam();
+
+ok(!-f $output, 'Output does not exist');
+
+my $call = join(
+  ' ',
+  'perl', $script,
+  '--input' => $input,
+  '--output' => $output,
+  '-t' => 'Base#tokens_aggr',
+  '-bs' => 'DeReKo#Structure',
+  '-bp' => 'DeReKo#Structure',
+  '-l' => 'INFO'
+);
+
+# Test without compression
+stderr_like(
+  sub {
+    system($call);
+  },
+  qr!The code took!,
+  $call
+);
+
+ok(-f $output, 'Output does exist');
+ok((my $file = slurp $output), 'Slurp data');
+ok((my $json = decode_json $file), 'decode json');
+is($json->{textType}, 'Autobiographie', 'text type');
+is($json->{title}, 'Autobiographische Einzelheiten', 'Title');
+is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'Title');
+is($json->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base-sentences-paragraphs', 'Foundries');
+my $stream = $json->{data}->{stream};
+my $token = $stream->[0];
+is($token->[0], '-:base/paragraphs$<i>14', 'Paragraphs');
+is($token->[1], '-:base/sentences$<i>215', 'Sentences');
+
+is($token->[5], '<>:base/s:s$<b>64<i>0<i>30<i>2<b>2', 'struct');
+is($token->[7], '<>:dereko/s:s$<b>64<i>0<i>30<i>2<b>4', 'struct');
+is($token->[8], '<>:base/s:t$<b>64<i>0<i>35250<i>5238<b>0', 'struct');
+
+$token = $stream->[4];
+is($token->[0], '<>:base/s:s$<b>64<i>53<i>254<i>32<b>2', 'struct');
+is($token->[1], '<>:dereko/s:s$<b>64<i>53<i>254<i>32<b>5<s>1', 'struct');
+is($token->[2], '<>:base/s:p$<b>64<i>53<i>3299<i>504<b>1', 'struct');
+is($token->[3], '<>:dereko/s:p$<b>64<i>53<i>3299<i>504<b>4', 'struct');
+
+done_testing;
+
+__END__
commit	3741f8b0a0d6f8825ca8cf086338599cb53e6aa1	[log] [tgz]
author	Akron <nils@diewald-online.de>	Wed Dec 21 19:55:21 2016 +0100
committer	Akron <nils@diewald-online.de>	Wed Dec 21 19:55:21 2016 +0100
tree	42a6652b5ee26e5f0ff35c91e4a2eaec6e85937b
parent	53167fd2d9f52a82ff68b6a89a09b0074a65ca25 [diff]