Added base-sentences and base-paragraphs options
Change-Id: I695b65661d97785e75703207bfc83a316d0a4815
diff --git a/Changes b/Changes
index a1ecbe9..e08e227 100644
--- a/Changes
+++ b/Changes
@@ -1,9 +1,12 @@
+0.24 2016-12-21
+ - Added --base-sentences and --base-paragraphs options
+
0.23 2016-11-03
- Added wildcard support for document extraction
- Fixed archive iteration to not duplicate the first archive
- Added parallel extraction for document sigles
- Improved return value for existing files
- - Don't warn on recursion in CoreNLP/Constituency.
+ - Don't warn on recursion in CoreNLP/Constituency
0.22 2016-10-26
- Added support for document extraction
diff --git a/Readme.pod b/Readme.pod
index 0c2272b..5fab72d 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -97,12 +97,31 @@
Overwrite files that already exist.
-=item B<--token|-t> <foundry>[#<file>]
+=item B<--token|-t> <foundry>#<file>
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#tokens>.
+
+=item B<--base-sentences|-bs> <foundry>#<layer>
+
+Define the layer for base sentences.
+If given, this will be used instead of using C<Base#Sentences>.
+Currently C<DeReKo#Structure> is the only additional layer supported.
+
+ Defaults to unset.
+
+
+=item B<--base-paragraphs|-bp> <foundry>#<layer>
+
+Define the layer for base paragraphs.
+If given, this will be used instead of using C<Base#Paragraphs>.
+Currently C<DeReKo#Structure> is the only additional layer supported.
+
+ Defaults to unset.
+
+
=item B<--skip|-s> <foundry>[#<layer>]
Skip specific annotations by specifying the foundry
diff --git a/lib/KorAP/XML/Annotation/Base/Paragraphs.pm b/lib/KorAP/XML/Annotation/Base/Paragraphs.pm
index ced09c2..235e2ff 100644
--- a/lib/KorAP/XML/Annotation/Base/Paragraphs.pm
+++ b/lib/KorAP/XML/Annotation/Base/Paragraphs.pm
@@ -13,12 +13,12 @@
my $mtt = $stream->pos($span->p_start);
$mtt->add(
- term => '<>:base/s:p',
- o_start => $span->o_start,
- o_end => $span->o_end,
- p_end => $span->p_end,
- payload => '<b>1',
- pti => 64
+ term => '<>:base/s:p',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end,
+ payload => '<b>1',
+ pti => 64
);
$i++;
}
diff --git a/lib/KorAP/XML/Annotation/Base/Sentences.pm b/lib/KorAP/XML/Annotation/Base/Sentences.pm
index 28c9434..1d66c8a 100644
--- a/lib/KorAP/XML/Annotation/Base/Sentences.pm
+++ b/lib/KorAP/XML/Annotation/Base/Sentences.pm
@@ -16,12 +16,12 @@
$first = [$span->p_start, $span->o_start] unless defined $first;
$mtt->add(
- term => '<>:base/s:s',
- o_start => $span->o_start,
- o_end => $span->o_end,
- p_end => $span->p_end,
- payload => '<b>2',
- pti => 64
+ term => '<>:base/s:s',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end,
+ payload => '<b>2',
+ pti => 64
);
$last_p = $span->p_end;
$last_o = $span->o_end;
@@ -29,15 +29,15 @@
}
) or return;
-# my $mt = $$self->stream->pos($first->[0]);
-# $mt->add(
-# term => '<>:base/s:t',
-# o_start => $first->[1],
-# p_end => $last_p,
-# o_end => $last_o,
-# payload => '<b>0',
-# pti => 64
-# );
+ # my $mt = $$self->stream->pos($first->[0]);
+ # $mt->add(
+ # term => '<>:base/s:t',
+ # o_start => $first->[1],
+ # p_end => $last_p,
+ # o_end => $last_o,
+ # payload => '<b>0',
+ # pti => 64
+ # );
$$self->stream->add_meta('base/sentences', '<i>' . $i);
diff --git a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
index 80a284e..ced5476 100644
--- a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
+++ b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
@@ -1,9 +1,10 @@
package KorAP::XML::Annotation::DeReKo::Structure;
use KorAP::XML::Annotation::Base;
-use Data::Dumper;
sub parse {
my $self = shift;
+ my $as_base = shift // 0;
+ my ($sentences, $paragraphs) = (0,0);
$$self->add_spandata(
foundry => 'struct',
@@ -21,10 +22,10 @@
# Get attributes
if (ref $feature eq 'ARRAY') {
- $attrs = $feature->[1]->{fs}->{f};
- $attrs = ref $attrs eq 'ARRAY' ? $attrs : [$attrs];
- $feature = $feature->[0];
- $tui = $stream->tui($p_start);
+ $attrs = $feature->[1]->{fs}->{f};
+ $attrs = ref $attrs eq 'ARRAY' ? $attrs : [$attrs];
+ $feature = $feature->[0];
+ $tui = $stream->tui($p_start);
};
# Get term label
@@ -37,42 +38,71 @@
# Add structure
my $mt = $mtt->add(
- term => '<>:dereko/s:' . $name,
- o_start => $span->o_start,
- o_end => $span->o_end,
- p_start => $p_start,
- p_end => $p_end,
- pti => $span->milestone ? 65 : 64,
+ term => '<>:dereko/s:' . $name,
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_start => $p_start,
+ p_end => $p_end,
+ pti => $span->milestone ? 65 : 64,
);
my $level = $span->hash->{'-l'};
if ($level || $tui) {
- my $pl;
- $pl .= '<b>' . ($level ? $level - 1 : 0);
- $pl .= '<s>' . $tui if $tui;
- $mt->payload($pl);
+ my $pl;
+ $pl .= '<b>' . ($level ? $level - 1 : 0);
+ $pl .= '<s>' . $tui if $tui;
+ $mt->payload($pl);
+ };
+
+ # Use sentence and paragraph elements for base - but only the span
+ # type requested in $as_base, so e.g. 'base-sentences' adds no base/s:p
+ my $base_pl;
+ if ($as_base && $name eq 's' && index($as_base, 'sentences') >= 0) {
+   $base_pl = '<b>2';
+   $sentences++;
+ }
+ elsif ($as_base && $name eq 'p' && index($as_base, 'paragraphs') >= 0) {
+   $base_pl = '<b>1';
+   $paragraphs++;
+ };
+
+ # Clone Multiterm and add to stream, overriding the dereko payload
+ if ($base_pl) {
+   my $mt2 = $mt->clone;
+   $mt2->term('<>:base/s:' . $name);
+   $mt2->payload($base_pl);
+   $mtt->add($mt2);
+ };
# Add attributes
if ($attrs) {
- # Set a tui if attributes are set
- foreach (@$attrs) {
+ # Set a tui if attributes are set
+ foreach (@$attrs) {
- # Add attributes
- $mtt->add(
- term =>
- '@:dereko/s:' . $_->{'-name'} . ':' . $_->{'#text'},
- p_start => $p_start,
- pti => 17,
- payload => '<s>' . $tui .
- ($span->milestone ? '' : '<i>' . $p_end)
- );
- };
+ # Add attributes
+ $mtt->add(
+ term =>
+ '@:dereko/s:' . $_->{'-name'} . ':' . $_->{'#text'},
+ p_start => $p_start,
+ pti => 17,
+ payload => '<s>' . $tui .
+ ($span->milestone ? '' : '<i>' . $p_end)
+ );
+ };
};
}
) or return;
+ if ($as_base) {
+ if (index($as_base, 'sentences') >= 0) {
+ $$self->stream->add_meta('base/sentences', '<i>' . $sentences);
+ };
+ if (index($as_base, 'paragraphs') >= 0) {
+ $$self->stream->add_meta('base/paragraphs', '<i>' . $paragraphs);
+ };
+ };
+
return 1;
};
diff --git a/lib/KorAP/XML/Index/MultiTerm.pm b/lib/KorAP/XML/Index/MultiTerm.pm
index 87946e2..7e089de 100644
--- a/lib/KorAP/XML/Index/MultiTerm.pm
+++ b/lib/KorAP/XML/Index/MultiTerm.pm
@@ -128,8 +128,8 @@
'<i>' . $_[0]->[4];
};
-# my $pl = $_[0]->[1] ?
-# $_[0]->[1] - 1 : $_[0]->[0];
+ # my $pl = $_[0]->[1] ?
+ # $_[0]->[1] - 1 : $_[0]->[0];
if ($_[0]->[2] || $_[0]->[0]) {
@@ -139,10 +139,10 @@
};
if ($_[0]->[0]) {
if (index($_[0]->[0], '<') == 0) {
- $pre .= $_[0]->[0];
+ $pre .= $_[0]->[0];
}
else {
- $pre .= '<?>' . $_[0]->[0];
+ $pre .= '<?>' . $_[0]->[0];
};
};
};
@@ -151,6 +151,11 @@
};
+# Shallow copy; blessed into the invocant's class so subclasses survive
+sub clone {
+  return bless [@{$_[0]}], ref $_[0];
+};
+
sub to_string_2 {
my $self = shift;
my $string = $self->term;
@@ -166,10 +171,10 @@
};
if ($self->payload) {
if (index($self->payload, '<') == 0) {
- $string .= $self->payload;
+ $string .= $self->payload;
}
else {
- $string .= '<?>' . $self->payload;
+ $string .= '<?>' . $self->payload;
};
};
};
diff --git a/lib/KorAP/XML/Index/MultiTermToken.pm b/lib/KorAP/XML/Index/MultiTermToken.pm
index 9e78df8..3556422 100644
--- a/lib/KorAP/XML/Index/MultiTermToken.pm
+++ b/lib/KorAP/XML/Index/MultiTermToken.pm
@@ -1,5 +1,6 @@
package KorAP::XML::Index::MultiTermToken;
use KorAP::XML::Index::MultiTerm;
+use Scalar::Util qw/blessed/;
use List::MoreUtils 'uniq';
use Carp qw/carp croak/;
use strict;
@@ -16,8 +17,9 @@
sub add {
my $self = shift;
+
my $mt;
- unless (ref $_[0] eq 'MultiTerm') {
+ unless (blessed $_[0]) {
if (@_ == 1) {
$mt = KorAP::XML::Index::MultiTerm->new(term => $_[0]);
}
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 968d3d9..1f5fad8 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -15,7 +15,7 @@
use Data::Dumper;
use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
-our $VERSION = '0.23';
+our $VERSION = '0.24';
has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 27eb360..7974c76 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -73,12 +73,15 @@
# 2016/10/24
# - Added support for document extraction
#
-# 1016/10/27
+# 2016/10/27
# - Added wildcard support for document extraction
#
+# 2016/12/21
+# - added support for base-sentences and base-paragraphs
+#
# ----------------------------------------------------------
-our $LAST_CHANGE = '2016/10/27';
+our $LAST_CHANGE = '2016/12/21';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -101,6 +104,8 @@
'overwrite|w' => \(my $overwrite),
'meta|m=s' => \(my $meta),
'token|t=s' => \(my $token_base = 'OpenNLP#tokens'),
+ 'base-sentences|bs=s' => \(my $base_sentences = ''),
+ 'base-paragraphs|bp=s' => \(my $base_paragraphs = ''),
'gzip|z' => \(my $gzip),
'skip|s=s' => \@skip,
'sigle|sg=s' => \@sigle,
@@ -130,6 +135,9 @@
}
);
+$base_sentences = lc $base_sentences;
+$base_paragraphs = lc $base_paragraphs;
+
my %ERROR_HASH = (
-sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
-verbose => 99,
@@ -158,8 +166,8 @@
$skip{lc($_)} = 1 foreach @skip;
my @layers;
-push(@layers, ['Base', 'Sentences']);
-push(@layers, ['Base', 'Paragraphs']);
+push(@layers, ['Base', 'Sentences']) unless $base_sentences;
+push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;
# Connexor
push(@layers, ['Connexor', 'Morpho']);
@@ -173,8 +181,20 @@
push(@layers, ['CoreNLP', 'Morpho']);
push(@layers, ['CoreNLP', 'Constituency']);
+
# DeReKo
-push(@layers, ['DeReKo', 'Structure']);
+if ($base_sentences eq 'dereko#structure' && $base_paragraphs eq 'dereko#structure') {
+ push(@layers, ['DeReKo', 'Structure', 'base-sentences-paragraphs']);
+}
+elsif ($base_sentences eq 'dereko#structure') {
+ push(@layers, ['DeReKo', 'Structure', 'base-sentences']);
+}
+elsif ($base_paragraphs eq 'dereko#structure') {
+ push(@layers, ['DeReKo', 'Structure', 'base-paragraphs']);
+}
+else {
+ push(@layers, ['DeReKo', 'Structure']);
+};
# Glemm
push(@layers, ['Glemm', 'Morpho']);
@@ -670,12 +690,31 @@
Overwrite files that already exist.
-=item B<--token|-t> <foundry>[#<file>]
+=item B<--token|-t> <foundry>#<file>
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#tokens>.
+
+=item B<--base-sentences|-bs> <foundry>#<layer>
+
+Define the layer for base sentences.
+If given, this will be used instead of using C<Base#Sentences>.
+Currently C<DeReKo#Structure> is the only additional layer supported.
+
+ Defaults to unset.
+
+
+=item B<--base-paragraphs|-bp> <foundry>#<layer>
+
+Define the layer for base paragraphs.
+If given, this will be used instead of using C<Base#Paragraphs>.
+Currently C<DeReKo#Structure> is the only additional layer supported.
+
+ Defaults to unset.
+
+
=item B<--skip|-s> <foundry>[#<layer>]
Skip specific annotations by specifying the foundry
diff --git a/t/real/goethe-2.t b/t/real/goethe-2.t
index 2b4f720..d197617 100644
--- a/t/real/goethe-2.t
+++ b/t/real/goethe-2.t
@@ -128,18 +128,31 @@
ok(!exists $output->{docEditor}, 'Correct Text Type');
## Base
-$tokens->add('DeReKo', 'Structure');
+$tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs');
$output = $tokens->to_data;
-is($output->{data}->{foundries}, 'dereko dereko/structure', 'Foundries');
+is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs', 'Foundries');
is($output->{data}->{layerInfos}, 'dereko/s=spans', 'layerInfos');
my $first_token = join('||', @{$output->{data}->{stream}->[0]});
like($first_token, qr/s:Autobiographische/, 'data');
like($first_token, qr/_0\$<i>0<i>17/, 'data');
like($first_token, qr!<>:dereko/s:s\$<b>64<i>0<i>30<i>2<b>4!, 'data');
-#like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>30<i>2<b>4!, 'data');
-#like($first_token, qr!<>:base\/s:t\$<b>64<i>0<i>35199<i>5226<b>0!, 'data');
+like($first_token, qr!<>:base\/s:t\$<b>64<i>0<i>35250<i>5233<b>0!, 'data');
+like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>30<i>2<b>2!, 'data');
+like($first_token, qr!-:base\/paragraphs\$\<i\>14!, 'data');
+like($first_token, qr!-:base\/sentences\$\<i\>215!, 'data');
+
+# Check paragraph
+$first_token = join('||', @{$output->{data}->{stream}->[4]});
+like($first_token, qr/s:immer/, 'data');
+like($first_token, qr!<>:base\/s:s\$<b>64<i>53<i>254<i>32<b>2!, 'data');
+like($first_token, qr!<>:dereko\/s:s\$<b>64<i>53<i>254<i>32<b>5<s>1!, 'data');
+like($first_token, qr!<>:base/s:p\$\<b>64<i>53<i>3299<i>504<b>1!, 'data');
+like($first_token, qr!<>:dereko/s:p\$\<b>64<i>53<i>3299<i>504<b>4!, 'data');
+
+$first_token = join('||', @{$output->{data}->{stream}->[180]});
+like($first_token, qr/i:geschäften/, 'data');
done_testing;
__END__
diff --git a/t/script/base.t b/t/script/base.t
new file mode 100644
index 0000000..bd2d8e5
--- /dev/null
+++ b/t/script/base.t
@@ -0,0 +1,69 @@
+#!/usr/bin/env perl
+# Script test: run korapxml2krill with DeReKo#Structure as base sentences/paragraphs
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/ :POSIX /;
+use Mojo::Util qw/slurp/;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output;
+use Data::Dumper;
+use utf8;
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+
+my $input = catdir($f, '..', 'corpus', 'GOE2', 'AGA', '03828');
+ok(-d $input, 'Input directory found');
+
+my $output = tmpnam();
+ok(!-f $output, 'Output does not exist');
+
+# Argument list (not a joined string), so paths never pass through a shell
+my @call = (
+  $^X, $script,
+  '--input' => $input,
+  '--output' => $output,
+  '-t' => 'Base#tokens_aggr',
+  '-bs' => 'DeReKo#Structure',
+  '-bp' => 'DeReKo#Structure',
+  '-l' => 'INFO'
+);
+
+# Test without compression
+stderr_like(
+  sub {
+    system(@call);
+  },
+  qr!The code took!,
+  join(' ', @call)
+);
+
+ok(-f $output, 'Output does exist');
+ok((my $file = slurp $output), 'Slurp data');
+ok((my $json = decode_json $file), 'decode json');
+is($json->{textType}, 'Autobiographie', 'text type');
+is($json->{title}, 'Autobiographische Einzelheiten', 'Title');
+is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'TokenSource');
+is($json->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base-sentences-paragraphs', 'Foundries');
+my $stream = $json->{data}->{stream};
+my $token = $stream->[0];
+is($token->[0], '-:base/paragraphs$<i>14', 'Paragraphs');
+is($token->[1], '-:base/sentences$<i>215', 'Sentences');
+
+is($token->[5], '<>:base/s:s$<b>64<i>0<i>30<i>2<b>2', 'struct');
+is($token->[7], '<>:dereko/s:s$<b>64<i>0<i>30<i>2<b>4', 'struct');
+is($token->[8], '<>:base/s:t$<b>64<i>0<i>35250<i>5238<b>0', 'struct');
+
+$token = $stream->[4];
+is($token->[0], '<>:base/s:s$<b>64<i>53<i>254<i>32<b>2', 'struct');
+is($token->[1], '<>:dereko/s:s$<b>64<i>53<i>254<i>32<b>5<s>1', 'struct');
+is($token->[2], '<>:base/s:p$<b>64<i>53<i>3299<i>504<b>1', 'struct');
+is($token->[3], '<>:dereko/s:p$<b>64<i>53<i>3299<i>504<b>4', 'struct');
+
+unlink $output; # remove the temporary output file created by the script
+done_testing;
+__END__