Added support for empty elements (e.g. pagebreaks)
Change-Id: Ifa9114601f3d9fd954c5ba493a268e75ed9b834e
diff --git a/Changes b/Changes
index 4ff26ea..033ccaa 100644
--- a/Changes
+++ b/Changes
@@ -1,13 +1,18 @@
-0.3 2014-11-03
+0.04 2016-01-17
+ - Added PTI to all payloads.
+ - Added support for empty elements.
+ - Added support for element attributes in struct.
+
+0.03 2014-11-03
- Added new metadata scheme.
- Fixed a minor bug in the constituency tree building.
- Sorted terms in tokens a priori.
-0.2 2014-07-21
+0.02 2014-07-21
- Sentence annotations for all providing foundries
- Starting subtokenization
-0.1 2014-04-15
+0.01 2014-04-15
- [bugfix] for first token annotations
- Sentences are now available from all foundries that have it
- <>:p is now <>:base/para
diff --git a/lib/KorAP/Field/MultiTerm.pm b/lib/KorAP/Field/MultiTerm.pm
index ddd6880..835749d 100644
--- a/lib/KorAP/Field/MultiTerm.pm
+++ b/lib/KorAP/Field/MultiTerm.pm
@@ -119,7 +119,9 @@
# $_[0]->[1] - 1 : $_[0]->[0];
if ($_[0]->[2] || $_[0]->[0]) {
- if ($_[0]->[2]) {
+
+ # p_end
+ if (defined $_[0]->[2]) {
$pre .= '<i>' . $_[0]->[2];
};
if ($_[0]->[0]) {
diff --git a/lib/KorAP/Indexer.pm b/lib/KorAP/Indexer.pm
index 94503a4..a428437 100644
--- a/lib/KorAP/Indexer.pm
+++ b/lib/KorAP/Indexer.pm
@@ -1,5 +1,5 @@
package KorAP::Indexer;
-our $VERSION = 0.03;
+our $VERSION = 0.04;
1;
diff --git a/lib/KorAP/Tokenizer/Spans.pm b/lib/KorAP/Tokenizer/Spans.pm
index 4ba81b1..eaf1549 100644
--- a/lib/KorAP/Tokenizer/Spans.pm
+++ b/lib/KorAP/Tokenizer/Spans.pm
@@ -2,9 +2,9 @@
use strict;
use warnings;
use KorAP::Log;
+use Data::Dumper;
use Mojo::Base 'KorAP::Tokenizer::Units';
use KorAP::Tokenizer::Span;
-# use Mojo::DOM;
use Mojo::ByteStream 'b';
use XML::Fast;
use Try::Tiny;
diff --git a/lib/KorAP/Tokenizer/Units.pm b/lib/KorAP/Tokenizer/Units.pm
index 5894b93..085a9fa 100644
--- a/lib/KorAP/Tokenizer/Units.pm
+++ b/lib/KorAP/Tokenizer/Units.pm
@@ -41,19 +41,24 @@
$span->p_start($start);
- my $end = $self->match->endswith($span->o_end);
+ if ($span->milestone) {
+ $span->p_end($start);
+ }
+ else {
+ my $end = $self->match->endswith($span->o_end);
- unless (defined $end) {
- $end = $self->range->before($span->o_end);
- return unless defined $end;
- };
+ unless (defined $end) {
+ $end = $self->range->before($span->o_end);
+ return unless defined $end;
+ };
- # $span->p_end($end);
- # return unless $span->p_end >= $span->p_start;
+ # $span->p_end($end);
+ # return unless $span->p_end >= $span->p_start;
- # EXPERIMENTAL:
- return unless $end >= $span->p_start;
- $span->p_end($end + 1);
+ # EXPERIMENTAL:
+ return unless $end >= $span->p_start;
+ $span->p_end($end + 1);
+ }
$span->hash($s) if $s;
diff --git a/t/index/corpus/doc/0001/struct/structure.xml b/t/index/corpus/doc/0001/struct/structure.xml
index 68d4dc2..30edc1e 100644
--- a/t/index/corpus/doc/0001/struct/structure.xml
+++ b/t/index/corpus/doc/0001/struct/structure.xml
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
-<layer docid="A01_APR.13047" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
+<layer docid="Corpus_Doc.0001" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
<spanList>
<span id="s0" from="0" to="128" l="1">
<fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
@@ -89,6 +89,18 @@
</f>
</fs>
</span>
+ <span id="sx" from="42" to="42" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">pb</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="n">2</f>
+ <f name="id">Corpus.Doc-0001-pb2</f>
+ <f name="TEIform">pb</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
<span id="s10" from="43" to="128" l="6">
<fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
<f name="name">div</f>
diff --git a/t/index/dereko_struct.t b/t/index/dereko_struct.t
index 30c41f0..beaf248 100644
--- a/t/index/dereko_struct.t
+++ b/t/index/dereko_struct.t
@@ -21,7 +21,7 @@
# Empty element (from 0 to 0) on level 1, with TUI 2
is($data->{stream}->[0]->[1],
- '<>:dereko/s:idsHeader$<b>65<i>0<i>0<i>1<b>1<s>2',
+ '<>:dereko/s:idsHeader$<b>65<i>0<i>0<i>0<b>1<s>2',
'Empty element');
# Attributes:
@@ -46,7 +46,17 @@
'Attribute of idsHeader');
-diag 'TODO: Test for element spans';
+is($data->{stream}->[4]->[1],
+ '<>:dereko/s:s$<b>64<i>32<i>42<i>6<b>6<s>1',
+ 'Sentence span');
+
+is($data->{stream}->[4]->[2],
+ '@:dereko/s:broken:no$<b>17<s>1<i>6',
+ 'Attribute of sentence span');
+
+is($data->{stream}->[6]->[0],
+ '<>:dereko/s:pb$<b>65<i>42<i>42<i>6<b>6<s>1',
+ 'Pagebreak element');
done_testing;