Added support for pagebreak annotations (indexed as '~'-prefixed terms)

Change-Id: I1e484756cedfd2450da55b031a8749ca1f98b891
diff --git a/Changes b/Changes
index 8c4e120..dfbeed5 100644
--- a/Changes
+++ b/Changes
@@ -1,10 +1,11 @@
-0.25 2017-02-06
+0.25 2017-02-08
         - Updated to Mojolicious 7.20
         - Fixed meta treatment in case analytic and monogr
           are available
         - Added DRuKoLa support to script
         - Liberated document and text sigle handling to be
           compliant with CoRoLa.
+        - Added support for pagebreak annotations.
 
 0.24 2016-12-21
         - Added --base-sentences and --base-paragraphs options
diff --git a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
index ced5476..7bd12f2 100644
--- a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
+++ b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
@@ -1,5 +1,7 @@
 package KorAP::XML::Annotation::DeReKo::Structure;
 use KorAP::XML::Annotation::Base;
+use List::Util qw/first/;
+use Scalar::Util qw/looks_like_number/;
 
 sub parse {
   my $self = shift;
@@ -55,23 +57,39 @@
       };
 
       # Use sentence and paragraph elements for base
-      if ($as_base && ($name eq 's' || $name eq 'p')) {
-
-        # Clone Multiterm
-        my $mt2 = $mt->clone;
-        $mt2->term('<>:base/s:' . $name);
+      if ($as_base && ($name eq 's' || $name eq 'p' || $name eq 'pb')) {
 
         if ($name eq 's' && index($as_base, 'sentences') >= 0) {
+          # Clone Multiterm
+          my $mt2 = $mt->clone;
+          $mt2->term('<>:base/s:' . $name);
           $mt2->payload('<b>2');
           $sentences++;
+
+          # Add to stream
+          $mtt->add($mt2);
         }
         elsif ($name eq 'p' && index($as_base, 'paragraphs') >= 0) {
+          # Clone Multiterm
+          my $mt2 = $mt->clone;
+          $mt2->term('<>:base/s:' . $name);
           $mt2->payload('<b>1');
           $paragraphs++;
-        };
 
-        # Add to stream
-        $mtt->add($mt2);
+          # Add to stream
+          $mtt->add($mt2);
+        }
+
+        # Add pagebreaks as '~'-prefixed terms (payload: page number, start offset)
+        elsif ($name eq 'pb' && index($as_base, 'pagebreaks') >= 0) {
+          if (my $nr = first { $_->{-name} eq 'n' } @$attrs) {
+            if (($nr = $nr->{'#text'}) && looks_like_number($nr)) {
+              my $mt2 = $mtt->add('~:base/s:pb');
+              $mt2->payload('<i>' . $nr . '<i>' . $span->o_start);
+              $mt2->store_offsets(0);
+            };
+          };
+        };
       };
 
       # Add attributes
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 81497fe..fab6147 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -82,9 +82,12 @@
 # 2017/01/20
 # - added support for DRuKoLa annotations
 #
+# 2017/02/08
+# - added support for pagebreak annotations
+#
 # ----------------------------------------------------------
 
-our $LAST_CHANGE = '2017/01/20';
+our $LAST_CHANGE = '2017/02/08';
 our $LOCAL = $FindBin::Bin;
 our $VERSION_MSG = <<"VERSION";
 Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -109,6 +112,7 @@
   'token|t=s'   => \(my $token_base = 'OpenNLP#tokens'),
   'base-sentences|bs=s' => \(my $base_sentences = ''),
   'base-paragraphs|bp=s' => \(my $base_paragraphs = ''),
+  'base-pagebreaks|bpb=s' => \(my $base_pagebreaks = ''),
   'gzip|z'      => \(my $gzip),
   'skip|s=s'    => \@skip,
   'sigle|sg=s'  => \@sigle,
@@ -186,14 +190,19 @@
 
 
 # DeReKo
-if ($base_sentences eq 'dereko#structure' && $base_paragraphs eq 'dereko#structure') {
-  push(@layers, ['DeReKo', 'Structure', 'base-sentences-paragraphs']);
-}
-elsif ($base_sentences eq 'dereko#structure') {
-  push(@layers, ['DeReKo', 'Structure', 'base-sentences']);
-}
-elsif ($base_paragraphs eq 'dereko#structure') {
-  push(@layers, ['DeReKo', 'Structure', 'base-paragraphs']);
+my @dereko_attr = ();
+if ($base_sentences eq 'dereko#structure') {
+  push @dereko_attr, 'sentences';
+};
+if ($base_paragraphs eq 'dereko#structure') {
+  push @dereko_attr, 'paragraphs';
+};
+if ($base_pagebreaks eq 'dereko#structure') {
+  push @dereko_attr, 'pagebreaks';
+};
+
+if ($dereko_attr[0]) {
+  push(@layers, ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]);
 }
 else {
   push(@layers, ['DeReKo', 'Structure']);
@@ -722,6 +731,14 @@
  Defaults to unset.
 
 
+=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
+
+Define the layer from which base pagebreak annotations are taken.
+Currently C<DeReKo#Structure> is the only supported layer.
+
+ Defaults to unset.
+
+
 =item B<--skip|-s> <foundry>[#<layer>]
 
 Skip specific annotations by specifying the foundry
diff --git a/t/annotation/dereko_struct.t b/t/annotation/dereko_struct.t
index 8f5f18a..5569369 100644
--- a/t/annotation/dereko_struct.t
+++ b/t/annotation/dereko_struct.t
@@ -10,7 +10,7 @@
 
 ok(my $tokens = TestInit::tokens('0001'), 'Parse tokens');
 
-ok($tokens->add('DeReKo', 'Structure'), 'Add Structure');
+ok($tokens->add('DeReKo', 'Structure', 'pagebreaks'), 'Add Structure');
 
 my $data = $tokens->to_data->{data};
 
@@ -38,8 +38,6 @@
    '@:dereko/s:version:1.1$<b>17<s>2',
    'Attribute of idsHeader');
 
-
-
 is($data->{stream}->[0]->[14],
    '@:dereko/s:pattern:text$<b>17<s>2',
    'Attribute of idsHeader');
@@ -56,6 +54,10 @@
    '<>:dereko/s:pb$<b>65<i>42<i>42<i>6<b>6<s>1',
    'Pagebreak element');
 
+is($data->{stream}->[6]->[-1],
+   '~:base/s:pb$<i>2<i>42',
+   'Pagebreak element');
+
 done_testing;
 
 __END__