Added pagebreak annotations as '~:base/s:pb' terms (enabled via --base-pagebreaks)
Change-Id: I1e484756cedfd2450da55b031a8749ca1f98b891
diff --git a/Changes b/Changes
index 8c4e120..dfbeed5 100644
--- a/Changes
+++ b/Changes
@@ -1,10 +1,11 @@
-0.25 2017-02-06
+0.25 2017-02-08
- Updated to Mojolicious 7.20
- Fixed meta treatment in case analytic and monogr
are available
- Added DRuKoLa support to script
- Liberated document and text sigle handling to be
compliant with CoRoLa.
+ - Added support for pagebreak annotations.
0.24 2016-12-21
- Added --base-sentences and --base-paragraphs options
diff --git a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
index ced5476..7bd12f2 100644
--- a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
+++ b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
@@ -1,5 +1,7 @@
package KorAP::XML::Annotation::DeReKo::Structure;
use KorAP::XML::Annotation::Base;
+use List::Util qw/first/;
+use Scalar::Util qw/looks_like_number/;
sub parse {
my $self = shift;
@@ -55,23 +57,39 @@
};
# Use sentence and paragraph elements for base
- if ($as_base && ($name eq 's' || $name eq 'p')) {
-
- # Clone Multiterm
- my $mt2 = $mt->clone;
- $mt2->term('<>:base/s:' . $name);
+ if ($as_base && ($name eq 's' || $name eq 'p' || $name eq 'pb')) {
if ($name eq 's' && index($as_base, 'sentences') >= 0) {
+ # Clone Multiterm
+ my $mt2 = $mt->clone;
+ $mt2->term('<>:base/s:' . $name);
$mt2->payload('<b>2');
$sentences++;
+
+ # Add to stream
+ $mtt->add($mt2);
}
elsif ($name eq 'p' && index($as_base, 'paragraphs') >= 0) {
+ # Clone Multiterm
+ my $mt2 = $mt->clone;
+ $mt2->term('<>:base/s:' . $name);
$mt2->payload('<b>1');
$paragraphs++;
- };
- # Add to stream
- $mtt->add($mt2);
+ # Add to stream
+ $mtt->add($mt2);
+ }
+
+ # Add pagebreaks
+ elsif ($name eq 'pb' && index($as_base, 'pagebreaks') >= 0) {
+ if (my $nr = first { $_->{-name} eq 'n' } @$attrs) {
+ if (($nr = $nr->{'#text'}) && looks_like_number($nr)) {
+ my $mt2 = $mtt->add('~:base/s:pb');
+ $mt2->payload('<i>' . $nr . '<i>' . $span->o_start);
+ $mt2->store_offsets(0);
+ };
+ };
+ };
};
# Add attributes
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 81497fe..fab6147 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -82,9 +82,12 @@
# 2017/01/20
# - added support for DRuKoLa annotations
#
+# 2017/02/08
+# - added support for pagebreak annotations
+#
# ----------------------------------------------------------
-our $LAST_CHANGE = '2017/01/20';
+our $LAST_CHANGE = '2017/02/08';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -109,6 +112,7 @@
'token|t=s' => \(my $token_base = 'OpenNLP#tokens'),
'base-sentences|bs=s' => \(my $base_sentences = ''),
'base-paragraphs|bp=s' => \(my $base_paragraphs = ''),
+ 'base-pagebreaks|bpb=s' => \(my $base_pagebreaks = ''),
'gzip|z' => \(my $gzip),
'skip|s=s' => \@skip,
'sigle|sg=s' => \@sigle,
@@ -186,14 +190,19 @@
# DeReKo
-if ($base_sentences eq 'dereko#structure' && $base_paragraphs eq 'dereko#structure') {
- push(@layers, ['DeReKo', 'Structure', 'base-sentences-paragraphs']);
-}
-elsif ($base_sentences eq 'dereko#structure') {
- push(@layers, ['DeReKo', 'Structure', 'base-sentences']);
-}
-elsif ($base_paragraphs eq 'dereko#structure') {
- push(@layers, ['DeReKo', 'Structure', 'base-paragraphs']);
+my @dereko_attr = ();
+if ($base_sentences eq 'dereko#structure') {
+ push @dereko_attr, 'sentences';
+};
+if ($base_paragraphs eq 'dereko#structure') {
+ push @dereko_attr, 'paragraphs';
+};
+if ($base_pagebreaks eq 'dereko#structure') {
+ push @dereko_attr, 'pagebreaks';
+};
+
+if ($dereko_attr[0]) {
+ push(@layers, ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]);
}
else {
push(@layers, ['DeReKo', 'Structure']);
@@ -722,6 +731,14 @@
Defaults to unset.
+=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
+
+Define the layer for base pagebreaks.
+Currently C<DeReKo#Structure> is the only layer supported.
+
+ Defaults to unset.
+
+
=item B<--skip|-s> <foundry>[#<layer>]
Skip specific annotations by specifying the foundry
diff --git a/t/annotation/dereko_struct.t b/t/annotation/dereko_struct.t
index 8f5f18a..5569369 100644
--- a/t/annotation/dereko_struct.t
+++ b/t/annotation/dereko_struct.t
@@ -10,7 +10,7 @@
ok(my $tokens = TestInit::tokens('0001'), 'Parse tokens');
-ok($tokens->add('DeReKo', 'Structure'), 'Add Structure');
+ok($tokens->add('DeReKo', 'Structure', 'pagebreaks'), 'Add Structure');
my $data = $tokens->to_data->{data};
@@ -38,8 +38,6 @@
'@:dereko/s:version:1.1$<b>17<s>2',
'Attribute of idsHeader');
-
-
is($data->{stream}->[0]->[14],
'@:dereko/s:pattern:text$<b>17<s>2',
'Attribute of idsHeader');
@@ -56,6 +54,10 @@
'<>:dereko/s:pb$<b>65<i>42<i>42<i>6<b>6<s>1',
'Pagebreak element');
+is($data->{stream}->[6]->[-1],
+ '~:base/s:pb$<i>2<i>42',
+ 'Pagebreak element');
+
done_testing;
__END__