Fixed gap behind last token and <base/s:t> length
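
The text span <>:base/s:t and the trailing gap in the range index previously
ended at $have - 1, the position of the last token. Both now end at $have,
the exclusive end position used by the other span payloads, so a text with
18 tokens serializes its end as <i>18 instead of <i>17 (see the updated test
expectations below).

A minimal sketch of the corrected serialization, assuming only what the
updated tests show; the helper name is illustrative and not part of
KorAP::XML:

    #!/usr/bin/env perl
    use strict;
    use warnings;

    # Build the serialized text span term as it appears in the test
    # expectations, e.g. <>:base/s:t$<b>64<i>0<i>129<i>18<b>0
    sub text_span_payload {
      my ($have, $data_length) = @_;  # token count, primary text length
      return '<>:base/s:t$'
        . '<b>64'              # pti, as set in Tokenizer.pm
        . '<i>0'               # o_start (character offset)
        . '<i>' . $data_length # o_end (character offset)
        . '<i>' . $have        # p_end: now $have, no longer $have - 1
        . '<b>0';              # the '<b>0' payload set in Tokenizer.pm
    }

    print text_span_payload(18, 129), "\n";
    # <>:base/s:t$<b>64<i>0<i>129<i>18<b>0
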
Change-Id: I7b8d9cc90280c29d3ba90a8f97ddb63315dc8b0c
diff --git a/Changes b/Changes
index a351a92..9acb4a1 100644
--- a/Changes
+++ b/Changes
@@ -1,10 +1,12 @@
-0.40 2020-03-01
+0.40 2020-03-03
- Fixed XIP parser.
- Added example corpus of the
Redewiedergabe-Korpus.
- Fixed span offset bug.
- Fixed milestones behind the last
token bug.
+ - Fixed gap behind last token bug.
+ - Fixed <base/s:t> length.
0.39 2020-02-19
- Added Talismane support.
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index f9595dd..59ae17f 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -138,18 +138,19 @@
$should++;
# Ignore non-word, non-number, and non-verbal tokens per default
+ # Code point 9646 represents the musical pause symbol used in speech corpora
if ($self->non_verbal_tokens && ord($token) == 9646) {
# Non-verbal token
} elsif (!$self->non_word_tokens && $token !~ /[\w\d]/) {
# TODO: Recognize punctuations!
- # if ($mtt) {
- # my $term = [$token, $from, $to];
- # $mtt->add(
- # term => '.>:'.$token,
- # payload => '<i>'.$from . '<i>' . $to . '<b>' . $distance++
- # );
- # push(@non_word_tokens, $term);
- # }
+ # if ($mtt) {
+ # my $term = [$token, $from, $to];
+ # $mtt->add(
+ # term => '.>:'.$token,
+ # payload => '<i>'.$from . '<i>' . $to . '<b>' . $distance++
+ # );
+ # push(@non_word_tokens, $term);
+ # }
next;
};
@@ -157,12 +158,12 @@
$mtt = $mtts->add;
# while (scalar @non_word_tokens) {
- # local $_ = shift @non_word_tokens;
- # $mtt->add(
- # term => '.<:' . $_->[0],
- # payload => '<i>' . $_->[1] . '<i>' . $_->[2] . '<b>' . --$distance
- # );
- # $distance = 0;
+ # local $_ = shift @non_word_tokens;
+ # $mtt->add(
+ # term => '.<:' . $_->[0],
+ # payload => '<i>' . $_->[1] . '<i>' . $_->[2] . '<b>' . --$distance
+ # );
+ # $distance = 0;
# };
# Add gap for later finding matching positions before or after
@@ -208,15 +209,15 @@
$mtts->pos(0)->add(
term => '<>:base/s:t',
o_start => 0,
- p_end => ($have - 1),
+ p_end => $have,
o_end => $doc->primary->data_length,
payload => '<b>0',
pti => 64
);
- # Create a gap for the
+ # Create a gap for the end of the text
if ($doc->primary->data_length >= ($old - 1)) {
- $range->gap($old, $doc->primary->data_length + 1, $have-1)
+ $range->gap($old, $doc->primary->data_length + 1, $have)
};
# Add info
@@ -256,27 +257,27 @@
my $from = $-[1];
my $to = $+[1];
$mtt->add(
- term => 'i^1:' . substr($os, $from, $from + $to),
- o_start => $from + $o_start,
- o_end => $to + $o_start
+ term => 'i^1:' . substr($os, $from, $from + $to),
+ o_start => $from + $o_start,
+ o_end => $to + $o_start
) unless $to - $from == $l;
};
while ($s =~ /(0+)[^0]/g) {
my $from = $-[1];
my $to = $+[1];
$mtt->add(
- term => 'i^2:' . substr($os, $from, $from + $to),
- o_start => $from + $o_start,
- o_end => $to + $o_start
+ term => 'i^2:' . substr($os, $from, $from + $to),
+ o_start => $from + $o_start,
+ o_end => $to + $o_start
) unless $to - $from == $l;
};
while ($s =~ /(#)/g) {
my $from = $-[1];
my $to = $+[1];
$mtt->add(
- term => 'i^3:' . substr($os, $from, $from + $to),
- o_start => $from + $o_start,
- o_end => $to + $o_start
+ term => 'i^3:' . substr($os, $from, $from + $to),
+ o_start => $from + $o_start,
+ o_end => $to + $o_start
) unless $to - $from == $l;
};
};
@@ -772,10 +773,10 @@
my ($stream, $span) = @_;
my $mtt = $stream->pos($span->p_start);
$mtt->add(
- term => '<>:s',
- o_start => $span->o_start,
- o_end => $span->o_end,
- p_end => $span->p_end
+ term => '<>:s',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
);
}
);
@@ -804,9 +805,9 @@
# syntax
if ((my $found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
- $mtt->add(
- term => 'cnx_syn:' . $found
- );
+ $mtt->add(
+ term => 'cnx_syn:' . $found
+ );
};
});
diff --git a/lib/KorAP/XML/Tokenizer/Units.pm b/lib/KorAP/XML/Tokenizer/Units.pm
index 22f50d5..3ace9eb 100644
--- a/lib/KorAP/XML/Tokenizer/Units.pm
+++ b/lib/KorAP/XML/Tokenizer/Units.pm
@@ -70,6 +70,10 @@
unless (defined $end) {
$end = $self->range->before($span->o_end);
+ if (DEBUG && $span->o_end == 196) {
+ warn 'SPAN ends at ' . $span->o_end . ' and has ' . $end;
+ };
+
unless (defined $end) {
if (DEBUG) {
warn $span->id . ' has no valid end';
diff --git a/t/annotation/base_paragraphs.t b/t/annotation/base_paragraphs.t
index e5d0c6b..9f02f27 100644
--- a/t/annotation/base_paragraphs.t
+++ b/t/annotation/base_paragraphs.t
@@ -17,8 +17,8 @@
like($data->{foundries}, qr!base/paragraphs!, 'data');
is($data->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Number of paragraphs');
is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text');
-is($data->{stream}->[0]->[3], '<>:base/s:p$<b>64<i>0<i>129<i>17<b>1', 'Paragraph');
+is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Text');
+is($data->{stream}->[0]->[3], '<>:base/s:p$<b>64<i>0<i>129<i>18<b>1', 'Paragraph');
is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
done_testing;
diff --git a/t/annotation/base_sentences.t b/t/annotation/base_sentences.t
index 69bb313..8096e5b 100644
--- a/t/annotation/base_sentences.t
+++ b/t/annotation/base_sentences.t
@@ -17,8 +17,8 @@
like($data->{foundries}, qr!base/sentences!, 'data');
is($data->{stream}->[0]->[0], '-:base/sentences$<i>1', 'Number of paragraphs');
is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text');
-is($data->{stream}->[0]->[3], '<>:base/s:s$<b>64<i>0<i>129<i>17<b>2', 'Sentence');
+is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Text');
+is($data->{stream}->[0]->[3], '<>:base/s:s$<b>64<i>0<i>129<i>18<b>2', 'Sentence');
is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
done_testing;
diff --git a/t/annotation/connexor_morpho.t b/t/annotation/connexor_morpho.t
index de0f704..d426571 100644
--- a/t/annotation/connexor_morpho.t
+++ b/t/annotation/connexor_morpho.t
@@ -14,7 +14,7 @@
my $data = $tokens->to_data->{data};
like($data->{foundries}, qr!connexor/morpho!, 'data');
-is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Text boundary');
is($data->{stream}->[0]->[2], '_0$<i>0<i>3', 'Position');
is($data->{stream}->[1]->[1], 'cnx/l:letzt', 'Lemma');
is($data->{stream}->[1]->[2], 'cnx/p:A', 'POS');
diff --git a/t/annotation/connexor_phrase.t b/t/annotation/connexor_phrase.t
index a2cf6e3..f3df759 100644
--- a/t/annotation/connexor_phrase.t
+++ b/t/annotation/connexor_phrase.t
@@ -15,7 +15,7 @@
my $data = $tokens->to_data->{data};
like($data->{foundries}, qr!connexor/phrase!, 'data');
-is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Text boundary');
is($data->{stream}->[1]->[0], '<>:cnx/c:np$<b>64<i>4<i>30<i>4<b>0', 'Noun phrase');
done_testing;
diff --git a/t/annotation/connexor_sentences.t b/t/annotation/connexor_sentences.t
index 0c2891c..37db3fb 100644
--- a/t/annotation/connexor_sentences.t
+++ b/t/annotation/connexor_sentences.t
@@ -42,8 +42,8 @@
like($data->{foundries}, qr!connexor/sentences!, 'data');
is($data->{stream}->[0]->[0], '-:cnx/sentences$<i>1', 'Number of paragraphs');
is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
-is($data->{stream}->[0]->[3], '<>:cnx/s:s$<b>64<i>0<i>129<i>17<b>0', 'Sentence');
+is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Text boundary');
+is($data->{stream}->[0]->[3], '<>:cnx/s:s$<b>64<i>0<i>129<i>18<b>0', 'Sentence');
is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
done_testing;
diff --git a/t/annotation/corenlp_morpho.t b/t/annotation/corenlp_morpho.t
index ec6e518..cefc94e 100644
--- a/t/annotation/corenlp_morpho.t
+++ b/t/annotation/corenlp_morpho.t
@@ -15,7 +15,7 @@
my $data = $tokens->to_data->{data};
like($data->{foundries}, qr!corenlp/morpho!, 'data');
like($data->{layerInfos}, qr!corenlp/p=tokens!, 'data');
-is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Text boundary');
is($data->{stream}->[0]->[3], 'corenlp/p:APPRART', 'POS');
is($data->{stream}->[1]->[1], 'corenlp/p:ADJ', 'POS');
is($data->{stream}->[2]->[1], 'corenlp/p:ADJA', 'POS');
diff --git a/t/annotation/corenlp_sentences.t b/t/annotation/corenlp_sentences.t
index b9e9b75..09611c6 100644
--- a/t/annotation/corenlp_sentences.t
+++ b/t/annotation/corenlp_sentences.t
@@ -17,8 +17,8 @@
like($data->{foundries}, qr!corenlp/sentences!, 'data');
is($data->{stream}->[0]->[0], '-:corenlp/sentences$<i>1', 'Number of paragraphs');
is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
-is($data->{stream}->[0]->[3], '<>:corenlp/s:s$<b>64<i>0<i>129<i>17<b>0', 'Text');
+is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Text boundary');
+is($data->{stream}->[0]->[3], '<>:corenlp/s:s$<b>64<i>0<i>129<i>18<b>0', 'Text');
is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
is($data->{stream}->[-1]->[0], '_17$<i>124<i>128', 'Position');
diff --git a/t/annotation/dereko_struct.t b/t/annotation/dereko_struct.t
index 4e00bb3..921b58e 100644
--- a/t/annotation/dereko_struct.t
+++ b/t/annotation/dereko_struct.t
@@ -23,7 +23,7 @@
'Empty element');
-is($data->{stream}->[0]->[5], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[5], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Text boundary');
# Attributes:
is($data->{stream}->[0]->[11],
diff --git a/t/annotation/glemm_morpho.t b/t/annotation/glemm_morpho.t
index 208d9d1..0ac9705 100644
--- a/t/annotation/glemm_morpho.t
+++ b/t/annotation/glemm_morpho.t
@@ -17,7 +17,7 @@
like($data->{foundries}, qr!glemm/morpho!, 'data');
like($data->{layerInfos}, qr!glemm/l=tokens!, 'data');
-is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Text boundary');
is($data->{stream}->[0]->[3], 'glemm/l:__zu', 'Lemma');
is($data->{stream}->[1]->[1], 'glemm/l:__letzt-', 'Lemma');
diff --git a/t/annotation/malt_dependency.t b/t/annotation/malt_dependency.t
index f58204f..515ae8f 100644
--- a/t/annotation/malt_dependency.t
+++ b/t/annotation/malt_dependency.t
@@ -51,7 +51,7 @@
is($stream->[0]->[5], '>:malt/d:ROOT$<b>33<i>0<i>49<i>0<i>6', 'Term2Term relation');
# Text element
-is($stream->[0]->[4], '<>:base/s:t$<b>64<i>0<i>238<i>30<b>0', 'Text element');
+is($stream->[0]->[4], '<>:base/s:t$<b>64<i>0<i>238<i>31<b>0', 'Text element');
done_testing;
__END__
diff --git a/t/annotation/mate_morpho.t b/t/annotation/mate_morpho.t
index 736fd31..9798ba0 100644
--- a/t/annotation/mate_morpho.t
+++ b/t/annotation/mate_morpho.t
@@ -19,7 +19,7 @@
like($data->{layerInfos}, qr!mate/l=tokens!, 'data');
like($data->{layerInfos}, qr!mate/m=tokens!, 'data');
-is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Text boundary');
is($data->{stream}->[0]->[4], 'mate/l:zu', 'POS');
is($data->{stream}->[0]->[5], 'mate/m:case:dat', 'POS');
is($data->{stream}->[0]->[6], 'mate/m:gender:neut', 'POS');
diff --git a/t/annotation/mate_morpho_attr.t b/t/annotation/mate_morpho_attr.t
index b32c12c..e556d0f 100644
--- a/t/annotation/mate_morpho_attr.t
+++ b/t/annotation/mate_morpho_attr.t
@@ -18,7 +18,7 @@
like($data->{layerInfos}, qr!mate/p=tokens!, 'data');
like($data->{layerInfos}, qr!mate/l=tokens!, 'data');
-is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Text boundary');
is($data->{stream}->[0]->[2], '@:gender=neut$<b>16<s>1', 'POS');
is($data->{stream}->[0]->[3], '@:number=sg$<b>16<s>1', 'POS');
is($data->{stream}->[0]->[4], '@:case=dat$<b>16<s>1', 'POS');
diff --git a/t/annotation/mdp_dependency.t b/t/annotation/mdp_dependency.t
index de9144c..2fd341f 100644
--- a/t/annotation/mdp_dependency.t
+++ b/t/annotation/mdp_dependency.t
@@ -82,8 +82,8 @@
is($stream->[0]->[10], '>:mdp/d:ROOT$<b>33<i>0<i>317<i>0<i>40', 'Term-to-Element');
-is($stream->[-1]->[0], '>:mdp/d:ROOT$<b>33<i>26130<i>26153<i>3553<i>3554', 'Term-to-Element');
-is($stream->[3553]->[1], '<:mdp/d:ROOT$<b>34<i>26130<i>26153<i>3554<i>3553', 'Element-to-Term');
+is($stream->[-1]->[0], '>:mdp/d:ROOT$<b>33<i>26130<i>26153<i>3553<i>3555', 'Term-to-Element');
+is($stream->[3553]->[1], '<:mdp/d:ROOT$<b>34<i>26130<i>26153<i>3555<i>3553', 'Element-to-Term');
done_testing;
__END__
diff --git a/t/annotation/opennlp_morpho.t b/t/annotation/opennlp_morpho.t
index c8bcbb7..64d9008 100644
--- a/t/annotation/opennlp_morpho.t
+++ b/t/annotation/opennlp_morpho.t
@@ -10,7 +10,7 @@
ok(my $tokens = TestInit::tokens('0001'), 'Parse tokens');
-is($tokens->stream->pos(0)->to_string, '[(0-3)-:tokens$<i>18|<>:base/s:t$<b>64<i>0<i>129<i>17<b>0|_0$<i>0<i>3|i:zum|s:Zum]', 'Token is correct');
+is($tokens->stream->pos(0)->to_string, '[(0-3)-:tokens$<i>18|<>:base/s:t$<b>64<i>0<i>129<i>18<b>0|_0$<i>0<i>3|i:zum|s:Zum]', 'Token is correct');
is($tokens->stream->pos(1)->to_string, '[(4-11)_1$<i>4<i>11|i:letzten|s:letzten]', 'Token is correct');
diff --git a/t/annotation/opennlp_sentences.t b/t/annotation/opennlp_sentences.t
index 16c8a65..02eac83 100644
--- a/t/annotation/opennlp_sentences.t
+++ b/t/annotation/opennlp_sentences.t
@@ -17,7 +17,7 @@
like($data->{foundries}, qr!opennlp/sentences!, 'data');
is($data->{stream}->[0]->[0], '-:opennlp/sentences$<i>1', 'Number of Sentences');
is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[3], '<>:opennlp/s:s$<b>64<i>0<i>129<i>17<b>0', 'Sentence');
+is($data->{stream}->[0]->[3], '<>:opennlp/s:s$<b>64<i>0<i>129<i>18<b>0', 'Sentence');
is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
done_testing;
diff --git a/t/annotation/tt_sentences.t b/t/annotation/tt_sentences.t
index 703f1a6..a7894b8 100644
--- a/t/annotation/tt_sentences.t
+++ b/t/annotation/tt_sentences.t
@@ -19,7 +19,7 @@
like($data->{foundries}, qr!treetagger/sentences!, 'data');
is($data->{stream}->[0]->[0], '-:tokens$<i>18', 'Number of tokens');
is($data->{stream}->[0]->[1], '-:tt/sentences$<i>1', 'Number of paragraphs');
-is($data->{stream}->[0]->[3], '<>:tt/s:s$<b>64<i>0<i>130<i>17<b>0', 'Text');
+is($data->{stream}->[0]->[3], '<>:tt/s:s$<b>64<i>0<i>130<i>18<b>0', 'Text');
is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
is($data->{stream}->[-1]->[0], '_17$<i>124<i>128', 'Position');
diff --git a/t/annotation/xip_constituency.t b/t/annotation/xip_constituency.t
index 026321f..0f8a0b8 100644
--- a/t/annotation/xip_constituency.t
+++ b/t/annotation/xip_constituency.t
@@ -19,8 +19,8 @@
# The length includes the punct - but that doesn't matter
is($data->{stream}->[0]->[1], '<>:xip/c:PREP$<b>64<i>0<i>3<i>1<b>3', 'Prep phrase');
is($data->{stream}->[0]->[2], '<>:xip/c:PP$<b>64<i>0<i>30<i>4<b>2', 'pp phrase');
-is($data->{stream}->[0]->[4], '<>:xip/c:TOP$<b>64<i>0<i>129<i>17<b>0', 'top phrase');
-is($data->{stream}->[0]->[5], '<>:xip/c:MC$<b>64<i>0<i>129<i>17<b>1', 'mc phrase');
+is($data->{stream}->[0]->[4], '<>:xip/c:TOP$<b>64<i>0<i>129<i>18<b>0', 'top phrase');
+is($data->{stream}->[0]->[5], '<>:xip/c:MC$<b>64<i>0<i>129<i>18<b>1', 'mc phrase');
is($data->{stream}->[-1]->[0], '<>:xip/c:VERB$<b>64<i>124<i>128<i>18<b>4', 'Noun phrase');
diff --git a/t/annotation/xip_sentences.t b/t/annotation/xip_sentences.t
index 3f2c62c..88b3e21 100644
--- a/t/annotation/xip_sentences.t
+++ b/t/annotation/xip_sentences.t
@@ -18,7 +18,7 @@
is($data->{stream}->[0]->[0], '-:tokens$<i>18', 'Number of tokens');
is($data->{stream}->[0]->[1], '-:xip/sentences$<i>1', 'Number of paragraphs');
-is($data->{stream}->[0]->[3], '<>:xip/s:s$<b>64<i>0<i>129<i>17<b>0', 'Text');
+is($data->{stream}->[0]->[3], '<>:xip/s:s$<b>64<i>0<i>129<i>18<b>0', 'Text');
is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
is($data->{stream}->[-1]->[0], '_17$<i>124<i>128', 'Position');
diff --git a/t/batch_file.t b/t/batch_file.t
index ed32401..55c4d2c 100644
--- a/t/batch_file.t
+++ b/t/batch_file.t
@@ -35,7 +35,7 @@
is($json->{data}->{foundries}, '', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
-is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
+is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Data');
# Generate with Gzip
$bf->{gzip} = 1;
@@ -56,7 +56,7 @@
is($json->{data}->{foundries}, '', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
-is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
+is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Data');
# Generate with annotations
$bf->{gzip} = 0;
@@ -80,7 +80,7 @@
like($json->{data}->{text}, qr/Ende Schuljahr eingestellt wird\.$/, 'Primary text');
-is($token->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'base/s');
+is($token->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'base/s');
is($token->[2], '_0$<i>0<i>3', 'position');
is($token->[3], 'corenlp/p:APPRART', 'corenlp');
is($token->[5], 'opennlp/p:APPRART', 'opennlp');
diff --git a/t/real/goethe-2.t b/t/real/goethe-2.t
index ff3cba4..691889d 100644
--- a/t/real/goethe-2.t
+++ b/t/real/goethe-2.t
@@ -138,7 +138,7 @@
like($first_token, qr/s:Autobiographische/, 'data');
like($first_token, qr/_0\$<i>0<i>17/, 'data');
like($first_token, qr!<>:dereko/s:s\$<b>64<i>0<i>30<i>2<b>4!, 'data');
-like($first_token, qr!<>:base\/s:t\$<b>64<i>0<i>35242<i>5233<b>0!, 'data');
+like($first_token, qr!<>:base\/s:t\$<b>64<i>0<i>35242<i>5234<b>0!, 'data');
# like($first_token, qr!<>:base\/s:t\$<b>64<i>0<i>35250<i>5233<b>0!, 'data');
like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>30<i>2<b>2!, 'data');
like($first_token, qr!-:base\/paragraphs\$\<i\>14!, 'data');
diff --git a/t/real/goethe.t b/t/real/goethe.t
index 5bc5eb2..03af7cc 100644
--- a/t/real/goethe.t
+++ b/t/real/goethe.t
@@ -138,7 +138,7 @@
like($first_token, qr/s:Autobiographische/, 'data');
like($first_token, qr/_0\$<i>0<i>17/, 'data');
like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>30<i>2<b>2!, 'data');
-like($first_token, qr!<>:base\/s:t\$<b>64<i>0<i>35199<i>5226<b>0!, 'data');
+like($first_token, qr!<>:base\/s:t\$<b>64<i>0<i>35199<i>5227<b>0!, 'data');
## OpenNLP
$tokens->add('OpenNLP', 'Sentences');
diff --git a/t/real/hnc.t b/t/real/hnc.t
index 9aca507..5e44f6b 100644
--- a/t/real/hnc.t
+++ b/t/real/hnc.t
@@ -88,8 +88,9 @@
my $output = decode_json( $tokens->to_json );
-is($output->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>4368<i>577<b>0', 't');
+is($output->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>4368<i>578<b>0', 't');
is($output->{data}->{stream}->[0]->[3], 'i:addbot', 't');
+is($output->{data}->{stream}->[-1]->[0], '_577$<i>4359<i>4368', 't');
## Base
diff --git a/t/real/redew.t b/t/real/redew.t
index 25f369c..a5fe45b 100644
--- a/t/real/redew.t
+++ b/t/real/redew.t
@@ -114,15 +114,15 @@
my $first = $output->{data}->{stream}->[0];
is('-:tokens$<i>13',$first->[0]);
-is('<>:base/s:t$<b>64<i>0<i>197<i>12<b>0',$first->[1]);
-is('<>:dereko/s:text$<b>64<i>0<i>197<i>12<b>0',$first->[2]);
-is('<>:dereko/s:body$<b>64<i>118<i>197<i>12<b>1',$first->[3]);
-is('<>:dereko/s:p$<b>64<i>118<i>197<i>12<b>2',$first->[4]);
-is('<>:dereko/s:said$<b>64<i>118<i>197<i>12<b>3<s>1',$first->[5]);
-is('@:dereko/s:level:1$<b>17<s>1<i>12',$first->[6]);
-is('@:dereko/s:content:speech$<b>17<s>1<i>12',$first->[7]);
-is('@:dereko/s:mode:direct$<b>17<s>1<i>12',$first->[8]);
-is('@:dereko/s:id:1$<b>17<s>1<i>12',$first->[9]);
+is('<>:base/s:t$<b>64<i>0<i>197<i>13<b>0',$first->[1]);
+is('<>:dereko/s:text$<b>64<i>0<i>197<i>13<b>0',$first->[2]);
+is('<>:dereko/s:body$<b>64<i>118<i>197<i>13<b>1',$first->[3]);
+is('<>:dereko/s:p$<b>64<i>118<i>197<i>13<b>2',$first->[4]);
+is('<>:dereko/s:said$<b>64<i>118<i>197<i>13<b>3<s>1',$first->[5]);
+is('@:dereko/s:level:1$<b>17<s>1<i>13',$first->[6]);
+is('@:dereko/s:content:speech$<b>17<s>1<i>13',$first->[7]);
+is('@:dereko/s:mode:direct$<b>17<s>1<i>13',$first->[8]);
+is('@:dereko/s:id:1$<b>17<s>1<i>13',$first->[9]);
is('_0$<i>123<i>128',$first->[10]);
is("drukola/l:H\x{f6}rst",$first->[11]);
is('drukola/m:msd:rfpos',$first->[12]);
@@ -135,11 +135,11 @@
my $nine = join(',', @{$output->{data}->{stream}->[9]});
like($nine, qr{drukola\/l:nichts}, 'Nichts');
like($nine, qr{_9\$<i>170<i>176}, 'Term boundaries');
-unlike($nine, qr{<>:dereko/s:said\$<b>64<i>176<i>196<i>12<b>4<s>1}, 'Term boundaries');
+unlike($nine, qr{<>:dereko/s:said\$<b>64<i>176<i>196<i>13<b>4<s>1}, 'Term boundaries');
my $ten = join(',', @{$output->{data}->{stream}->[10]});
like($ten, qr{_10\$<i>177<i>180}, 'Term boundaries');
-like($ten, qr{<>:dereko/s:said\$<b>64<i>176<i>196<i>12<b>4<s>1}, 'Term boundaries');
+like($ten, qr{<>:dereko/s:said\$<b>64<i>176<i>196<i>13<b>4<s>1}, 'Term boundaries');
my $eleven = join(',', @{$output->{data}->{stream}->[11]});
like($eleven, qr{_11\$<i>181<i>188}, 'Term boundaries');
diff --git a/t/real/rei.t b/t/real/rei.t
index d50a732..e08516d 100644
--- a/t/real/rei.t
+++ b/t/real/rei.t
@@ -209,7 +209,7 @@
is('<>:dereko/s:docTitle$<b>64<i>0<i>91<i>11<b>3', $first->[15]);
is('<>:dereko/s:titlePart$<b>64<i>0<i>91<i>11<b>4<s>2', $first->[16]);
is('<>:dereko/s:s$<b>64<i>0<i>91<i>11<b>5', $first->[17]);
-is('<>:base/s:t$<b>64<i>0<i>17859<i>2640<b>0', $first->[18]);
+is('<>:base/s:t$<b>64<i>0<i>17859<i>2641<b>0', $first->[18]);
is('>:malt/d:ROOT$<b>33<i>0<i>48<i>0<i>7', $first->[19]);
is('<:malt/d:PP$<b>32<i>1', $first->[20]);
is('<:malt/d:ROOT$<b>34<i>0<i>48<i>7<i>0', $first->[21]);
@@ -298,8 +298,8 @@
is('<>:dereko/s:titlePage$<b>64<i>0<i>61<i>8<b>2<s>1', $first->[17]);
is('<>:dereko/s:docTitle$<b>64<i>0<i>61<i>8<b>3', $first->[18]);
is('<>:opennlp/s:s$<b>64<i>0<i>173<i>24<b>0', $first->[19]);
-is('<>:base/s:t$<b>64<i>0<i>7008<i>1008<b>0', $first->[20]);
-is('<>:dereko/s:text$<b>64<i>0<i>7008<i>1008<b>0', $first->[21]);
+is('<>:base/s:t$<b>64<i>0<i>7008<i>1009<b>0', $first->[20]);
+is('<>:dereko/s:text$<b>64<i>0<i>7008<i>1009<b>0', $first->[21]);
is('>:malt/d:GMOD$<b>32<i>3', $first->[22]);
is('<:malt/d:ROOT$<b>34<i>0<i>51<i>6<i>3', $first->[23]);
is('@:dereko/s:id:bng.00071-0-titlepage$<b>17<s>1<i>8', $first->[24]);
@@ -313,16 +313,17 @@
is('tt/p:NE', $first->[32]);
$last = $output->{data}->{stream}->[-1];
-is('<>:dereko/s:back$<b>65<i>7008<i>7008<i>1008<b>1', $last->[0]);
-is('<>:dereko/s:div$<b>65<i>7008<i>7008<i>1008<b>2<s>1', $last->[1]);
-is('@:dereko/s:n:1$<b>17<s>1', $last->[2]);
-is('@:dereko/s:type:footnotes$<b>17<s>1', $last->[3]);
-is('@:dereko/s:complete:y$<b>17<s>1', $last->[4]);
-is('_1008$<i>6990<i>7006', $last->[5]);
-is('corenlp/p:NN', $last->[6]);
-is('i:befreiungsschlag', $last->[7]);
-is('opennlp/p:NN', $last->[8]);
-is('s:Befreiungsschlag', $last->[9]);
+# No longer indexed:
+#is('<>:dereko/s:back$<b>65<i>7008<i>7008<i>1009<b>1', $last->[0]);
+#is('<>:dereko/s:div$<b>65<i>7008<i>7008<i>1009<b>2<s>1', $last->[1]);
+#is('@:dereko/s:n:1$<b>17<s>1', $last->[2]);
+#is('@:dereko/s:type:footnotes$<b>17<s>1', $last->[3]);
+#is('@:dereko/s:complete:y$<b>17<s>1', $last->[4]);
+is('_1008$<i>6990<i>7006', $last->[0]);
+is('corenlp/p:NN', $last->[1]);
+is('i:befreiungsschlag', $last->[2]);
+is('opennlp/p:NN', $last->[3]);
+is('s:Befreiungsschlag', $last->[4]);
done_testing;
__END__
diff --git a/t/script/base.t b/t/script/base.t
index 08baa64..7bfe48d 100644
--- a/t/script/base.t
+++ b/t/script/base.t
@@ -58,7 +58,7 @@
is($token->[5], '<>:base/s:s$<b>64<i>0<i>30<i>2<b>2', 'struct');
is($token->[7], '<>:dereko/s:s$<b>64<i>0<i>30<i>2<b>4', 'struct');
-is($token->[8], '<>:base/s:t$<b>64<i>0<i>35242<i>5238<b>0', 'struct');
+is($token->[8], '<>:base/s:t$<b>64<i>0<i>35242<i>5239<b>0', 'struct');
$token = $stream->[4];
is($token->[0], '<>:base/s:s$<b>64<i>53<i>254<i>32<b>2', 'struct');
diff --git a/t/sgbr/base.t b/t/sgbr/base.t
index 718a0ba..073209e 100644
--- a/t/sgbr/base.t
+++ b/t/sgbr/base.t
@@ -32,8 +32,8 @@
is($stream->[0]->[0], '-:base/sentences$<i>1');
is($stream->[0]->[1], '-:tokens$<i>15');
-is($stream->[0]->[2], '<>:base/s:t$<b>64<i>0<i>115<i>14<b>0');
-is($stream->[0]->[3], '<>:base/s:s$<b>64<i>16<i>114<i>14<b>2');
+is($stream->[0]->[2], '<>:base/s:t$<b>64<i>0<i>115<i>15<b>0');
+is($stream->[0]->[3], '<>:base/s:s$<b>64<i>16<i>114<i>15<b>2');
is($stream->[0]->[4], '_0$<i>17<i>18');
done_testing;
diff --git a/t/sgbr/lemma.t b/t/sgbr/lemma.t
index 3221dba..6635175 100644
--- a/t/sgbr/lemma.t
+++ b/t/sgbr/lemma.t
@@ -32,7 +32,7 @@
my $stream = $data->{stream};
is($stream->[0]->[0], '-:tokens$<i>51', 'Token number');
-is($stream->[0]->[1], '<>:base/s:t$<b>64<i>0<i>365<i>50<b>0', 'Text Boundary');
+is($stream->[0]->[1], '<>:base/s:t$<b>64<i>0<i>365<i>51<b>0', 'Text Boundary');
is($stream->[0]->[2], '_0$<i>0<i>18', 'Position');
is($stream->[0]->[3], 'i:sommerüberraschung', 'First term');
is($stream->[0]->[4], 's:Sommerüberraschung', 'First term');
diff --git a/t/sgbr/pos.t b/t/sgbr/pos.t
index 0163ed1..d750cb1 100644
--- a/t/sgbr/pos.t
+++ b/t/sgbr/pos.t
@@ -33,7 +33,7 @@
my $stream = $data->{stream};
is($stream->[0]->[0], '-:tokens$<i>51', 'Token number');
-is($stream->[0]->[1], '<>:base/s:t$<b>64<i>0<i>365<i>50<b>0', 'Text boundary');
+is($stream->[0]->[1], '<>:base/s:t$<b>64<i>0<i>365<i>51<b>0', 'Text boundary');
is($stream->[0]->[2], '_0$<i>0<i>18', 'Position');
is($stream->[0]->[3], 'i:sommerüberraschung', 'First term');
is($stream->[0]->[4], 's:Sommerüberraschung', 'First term');
diff --git a/t/tokenization.t b/t/tokenization.t
index eecb1d7..da135e5 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -73,7 +73,8 @@
is($json->{data}->{name}, 'tokens');
is($json->{data}->{tokenSource}, 'opennlp#tokens');
-is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>6083<i>1067<b>0');
+is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>1068');
+is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>6083<i>1068<b>0');
done_testing;
diff --git a/t/transform.t b/t/transform.t
index 97c5a8b..c071f81 100644
--- a/t/transform.t
+++ b/t/transform.t
@@ -133,7 +133,7 @@
is_deeply(
_t2h($tokens->stream->pos(0)->to_string),
- _t2h('[(0-1)s:A|i:a|_0$<i>0<i>1|-:tokens$<i>923|mate/p:XY|<>:base/s:s$<b>64<i>0<i>74<i>13<b>2|<>:base/s:t$<b>64<i>0<i>6083<i>922<b>0|-:base/sentences$<i>96]'),
+ _t2h('[(0-1)s:A|i:a|_0$<i>0<i>1|-:tokens$<i>923|mate/p:XY|<>:base/s:s$<b>64<i>0<i>74<i>13<b>2|<>:base/s:t$<b>64<i>0<i>6083<i>923<b>0|-:base/sentences$<i>96]'),
'Startinfo'
);
@@ -146,7 +146,7 @@
'-:tokens$<i>923|'.
'mate/p:XY|'.
'<>:base/s:s$<b>64<i>0<i>74<i>13<b>2|'.
- '<>:base/s:t$<b>64<i>0<i>6083<i>922<b>0|'.
+ '<>:base/s:t$<b>64<i>0<i>6083<i>923<b>0|'.
'-:base/sentences$<i>96|'.
'<>:base/s:p$<b>64<i>0<i>224<i>34<b>1|'.
'-:base/paragraphs$<i>76|'.