Added support for empty elements (e.g. pagebreaks)

Change-Id: Ifa9114601f3d9fd954c5ba493a268e75ed9b834e
diff --git a/Changes b/Changes
index 4ff26ea..033ccaa 100644
--- a/Changes
+++ b/Changes
@@ -1,13 +1,18 @@
-0.3 2014-11-03
+0.04 2016-01-17
+        - Added PTI to all payloads.
+	- Added support for empty elements.
+	- Added support for element attributes in struct.
+
+0.03 2014-11-03
         - Added new metadata scheme.
 	- Fixed a minor bug in the constituency tree building.
 	- Sorted terms in tokens a priori.
 
-0.2 2014-07-21
+0.02 2014-07-21
         - Sentence annotations for all providing foundries
 	- Starting subtokenization 
 
-0.1 2014-04-15
+0.01 2014-04-15
 	- [bugfix] for first token annotations
         - Sentences are now available from all foundries that have it
         - <>:p is now <>:base/para
diff --git a/lib/KorAP/Field/MultiTerm.pm b/lib/KorAP/Field/MultiTerm.pm
index ddd6880..835749d 100644
--- a/lib/KorAP/Field/MultiTerm.pm
+++ b/lib/KorAP/Field/MultiTerm.pm
@@ -119,7 +119,9 @@
 #    $_[0]->[1] - 1 : $_[0]->[0];
 
   if ($_[0]->[2] || $_[0]->[0]) {
-    if ($_[0]->[2]) {
+
+    # p_end
+    if (defined $_[0]->[2]) {
       $pre .= '<i>' . $_[0]->[2];
     };
     if ($_[0]->[0]) {
diff --git a/lib/KorAP/Indexer.pm b/lib/KorAP/Indexer.pm
index 94503a4..a428437 100644
--- a/lib/KorAP/Indexer.pm
+++ b/lib/KorAP/Indexer.pm
@@ -1,5 +1,5 @@
 package KorAP::Indexer;
 
-our $VERSION = 0.03;
+our $VERSION = 0.04;
 
 1;
diff --git a/lib/KorAP/Tokenizer/Spans.pm b/lib/KorAP/Tokenizer/Spans.pm
index 4ba81b1..eaf1549 100644
--- a/lib/KorAP/Tokenizer/Spans.pm
+++ b/lib/KorAP/Tokenizer/Spans.pm
@@ -2,9 +2,9 @@
 use strict;
 use warnings;
 use KorAP::Log;
+use Data::Dumper;
 use Mojo::Base 'KorAP::Tokenizer::Units';
 use KorAP::Tokenizer::Span;
-# use Mojo::DOM;
 use Mojo::ByteStream 'b';
 use XML::Fast;
 use Try::Tiny;
diff --git a/lib/KorAP/Tokenizer/Units.pm b/lib/KorAP/Tokenizer/Units.pm
index 5894b93..085a9fa 100644
--- a/lib/KorAP/Tokenizer/Units.pm
+++ b/lib/KorAP/Tokenizer/Units.pm
@@ -41,19 +41,24 @@
 
   $span->p_start($start);
 
-  my $end = $self->match->endswith($span->o_end);
+  if ($span->milestone) {
+    $span->p_end($start);
+  }
+  else {
+    my $end = $self->match->endswith($span->o_end);
 
-  unless (defined $end) {
-    $end = $self->range->before($span->o_end);
-    return unless defined $end;
-  };
+    unless (defined $end) {
+      $end = $self->range->before($span->o_end);
+      return unless defined $end;
+    };
 
-  # $span->p_end($end);
-  # return unless $span->p_end >= $span->p_start;
+    # $span->p_end($end);
+    # return unless $span->p_end >= $span->p_start;
 
-  # EXPERIMENTAL:
-  return unless $end >= $span->p_start;
-  $span->p_end($end + 1);
+    # EXPERIMENTAL:
+    return unless $end >= $span->p_start;
+    $span->p_end($end + 1);
+  }
 
   $span->hash($s) if $s;
 
diff --git a/t/index/corpus/doc/0001/struct/structure.xml b/t/index/corpus/doc/0001/struct/structure.xml
index 68d4dc2..30edc1e 100644
--- a/t/index/corpus/doc/0001/struct/structure.xml
+++ b/t/index/corpus/doc/0001/struct/structure.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
 
-<layer docid="A01_APR.13047" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
+<layer docid="Corpus_Doc.0001" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
   <spanList>
     <span id="s0" from="0" to="128" l="1">
       <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
@@ -89,6 +89,18 @@
         </f>
       </fs>
     </span>
+    <span id="sx" from="42" to="42" l="7">
+      <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+        <f name="name">pb</f>
+        <f name="attr">
+          <fs type="attr">
+            <f name="n">2</f>
+            <f name="id">Corpus.Doc-0001-pb2</f>
+            <f name="TEIform">pb</f>
+          </fs>
+        </f>
+      </fs>
+    </span>
     <span id="s10" from="43" to="128" l="6">
       <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
         <f name="name">div</f>
diff --git a/t/index/dereko_struct.t b/t/index/dereko_struct.t
index 30c41f0..beaf248 100644
--- a/t/index/dereko_struct.t
+++ b/t/index/dereko_struct.t
@@ -21,7 +21,7 @@
 
 # Empty element (from 0 to 0) on level 1, with TUI 2
 is($data->{stream}->[0]->[1],
-   '<>:dereko/s:idsHeader$<b>65<i>0<i>0<i>1<b>1<s>2',
+   '<>:dereko/s:idsHeader$<b>65<i>0<i>0<i>0<b>1<s>2',
    'Empty element');
 
 # Attributes:
@@ -46,7 +46,17 @@
    'Attribute of idsHeader');
 
 
-diag 'TODO: Test for element spans';
+is($data->{stream}->[4]->[1],
+   '<>:dereko/s:s$<b>64<i>32<i>42<i>6<b>6<s>1',
+   'Sentence span');
+
+is($data->{stream}->[4]->[2],
+   '@:dereko/s:broken:no$<b>17<s>1<i>6',
+   'Attribute of sentence span');
+
+is($data->{stream}->[6]->[0],
+   '<>:dereko/s:pb$<b>65<i>42<i>42<i>6<b>6<s>1',
+   'Pagebreak element');
 
 done_testing;