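Added base/s=t (the '<>:base/s:t' text-boundary span) during tokenization instead of in Index/Base/Sentences.pm; adjusted test stream offsets accordingly (2)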
Change-Id: I005be1ef836aa1016219c63c45b4fd2bb0431ffa
diff --git a/lib/KorAP/XML/Index/Base/Sentences.pm b/lib/KorAP/XML/Index/Base/Sentences.pm
index 1ca71d7..5766e8b 100644
--- a/lib/KorAP/XML/Index/Base/Sentences.pm
+++ b/lib/KorAP/XML/Index/Base/Sentences.pm
@@ -28,15 +28,15 @@
}
) or return;
- my $mt = $$self->stream->pos($first->[0]);
- $mt->add(
- term => '<>:base/s:t',
- o_start => $first->[1],
- p_end => $last_p,
- o_end => $last_o,
- payload => '<b>0',
- pti => 64
- );
+# my $mt = $$self->stream->pos($first->[0]);
+# $mt->add(
+# term => '<>:base/s:t',
+# o_start => $first->[1],
+# p_end => $last_p,
+# o_end => $last_o,
+# payload => '<b>0',
+# pti => 64
+# );
$$self->stream->add_meta('base/sentences', '<i>' . $i);
diff --git a/t/index/xip_constituency.t b/t/index/xip_constituency.t
index 17e6c7e..afd1e2a 100644
--- a/t/index/xip_constituency.t
+++ b/t/index/xip_constituency.t
@@ -18,14 +18,12 @@
# The length includes the punct - but that doesn't matter
is($data->{stream}->[0]->[1], '<>:xip/c:PREP$<b>64<i>0<i>3<i>1<b>3', 'Prep phrase');
-is($data->{stream}->[0]->[4], '<>:xip/c:PP$<b>64<i>0<i>30<i>4<b>2', 'pp phrase');
-done_testing;
-__END__
-is($data->{stream}->[0]->[6], '<>:xip/c:TOP$<b>64<i>0<i>129<i>17<b>0', 'top phrase');
-is($data->{stream}->[0]->[7], '<>:xip/c:MC$<b>64<i>0<i>129<i>17<b>1', 'mc phrase');
-
+is($data->{stream}->[0]->[2], '<>:xip/c:PP$<b>64<i>0<i>30<i>4<b>2', 'pp phrase');
+is($data->{stream}->[0]->[4], '<>:xip/c:TOP$<b>64<i>0<i>129<i>17<b>0', 'top phrase');
+is($data->{stream}->[0]->[5], '<>:xip/c:MC$<b>64<i>0<i>129<i>17<b>1', 'mc phrase');
is($data->{stream}->[-1]->[0], '<>:xip/c:VERB$<b>64<i>124<i>128<i>18<b>4', 'Noun phrase');
+
done_testing;
__END__
diff --git a/t/index/xip_morpho.t b/t/index/xip_morpho.t
index 01aed9e..db59ef3 100644
--- a/t/index/xip_morpho.t
+++ b/t/index/xip_morpho.t
@@ -17,8 +17,8 @@
like($data->{foundries}, qr!xip/morpho!, 'data');
like($data->{layerInfos}, qr!xip/l=tokens!, 'data');
like($data->{layerInfos}, qr!xip/p=tokens!, 'data');
-is($data->{stream}->[0]->[4], 'xip/l:zu', 'Lemma');
-is($data->{stream}->[0]->[5], 'xip/p:PREP', 'POS');
+is($data->{stream}->[0]->[5], 'xip/l:zu', 'Lemma');
+is($data->{stream}->[0]->[6], 'xip/p:PREP', 'POS');
is($data->{stream}->[1]->[3], 'xip/l:letzt', 'Lemma');
is($data->{stream}->[1]->[4], 'xip/p:ADJ', 'POS');
diff --git a/t/index/xip_sentences.t b/t/index/xip_sentences.t
index b9f5a66..896422e 100644
--- a/t/index/xip_sentences.t
+++ b/t/index/xip_sentences.t
@@ -16,10 +16,10 @@
like($data->{foundries}, qr!xip/sentences!, 'data');
-is($data->{stream}->[0]->[1], '-:xip/sentences$<i>1', 'Number of paragraphs');
is($data->{stream}->[0]->[0], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[2], '<>:xip/s:s$<b>64<i>0<i>129<i>17<b>0', 'Text');
-is($data->{stream}->[0]->[3], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[1], '-:xip/sentences$<i>1', 'Number of paragraphs');
+is($data->{stream}->[0]->[3], '<>:xip/s:s$<b>64<i>0<i>129<i>17<b>0', 'Text');
+is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
is($data->{stream}->[-1]->[0], '_17$<i>124<i>128', 'Position');
done_testing;
diff --git a/t/real/bzk.t b/t/real/bzk.t
index acaf668..196a420 100644
--- a/t/real/bzk.t
+++ b/t/real/bzk.t
@@ -89,7 +89,7 @@
is($output->{version}, '0.03', 'version');
is($output->{data}->{foundries}, '', 'Foundries');
is($output->{data}->{layerInfos}, '', 'layerInfos');
-is($output->{data}->{stream}->[0]->[3], 's:unser', 'data');
+is($output->{data}->{stream}->[0]->[4], 's:unser', 'data');
is($output->{textSigle}, 'BZK_D59.00001', 'Correct text sigle');
is($output->{docSigle}, 'BZK_D59', 'Correct document sigle');
diff --git a/t/real/bzk_2.t b/t/real/bzk_2.t
index e4995c9..4b958a2 100644
--- a/t/real/bzk_2.t
+++ b/t/real/bzk_2.t
@@ -90,7 +90,7 @@
is($output->{version}, '0.03', 'version');
is($output->{data}->{foundries}, '', 'Foundries');
is($output->{data}->{layerInfos}, '', 'layerInfos');
-is($output->{data}->{stream}->[0]->[3], 's:Saragat-Partei', 'data');
+is($output->{data}->{stream}->[0]->[4], 's:Saragat-Partei', 'data');
is($output->{textSigle}, 'BZK_D59.00089', 'Correct text sigle');
is($output->{docSigle}, 'BZK_D59', 'Correct document sigle');
diff --git a/t/real/goethe.t b/t/real/goethe.t
index 84f239f..eeefb39 100644
--- a/t/real/goethe.t
+++ b/t/real/goethe.t
@@ -86,7 +86,7 @@
is($output->{version}, '0.03', 'version');
is($output->{data}->{foundries}, '', 'Foundries');
is($output->{data}->{layerInfos}, '', 'layerInfos');
-is($output->{data}->{stream}->[0]->[3], 's:Autobiographische', 'data');
+is($output->{data}->{stream}->[0]->[4], 's:Autobiographische', 'data');
is($output->{textSigle}, 'GOE_AGA.03828', 'Correct text sigle');
is($output->{docSigle}, 'GOE_AGA', 'Correct document sigle');
diff --git a/t/real/wdd.t b/t/real/wdd.t
index 88e36de..41e8e60 100644
--- a/t/real/wdd.t
+++ b/t/real/wdd.t
@@ -81,7 +81,7 @@
is($output->{version}, '0.03', 'version');
is($output->{data}->{foundries}, '', 'Foundries');
is($output->{data}->{layerInfos}, '', 'layerInfos');
-is($output->{data}->{stream}->[0]->[3], 's:{War', 'data');
+is($output->{data}->{stream}->[0]->[4], 's:{War', 'data');
is($output->{textSigle}, 'WDD11_G27.38989', 'Correct text sigle');
is($output->{docSigle}, 'WDD11_G27', 'Correct document sigle');
diff --git a/t/sgbr/lemma.t b/t/sgbr/lemma.t
index 87f1d83..3221dba 100644
--- a/t/sgbr/lemma.t
+++ b/t/sgbr/lemma.t
@@ -32,11 +32,12 @@
my $stream = $data->{stream};
is($stream->[0]->[0], '-:tokens$<i>51', 'Token number');
-is($stream->[0]->[1], '_0$<i>0<i>18', 'Position');
-is($stream->[0]->[2], 'i:sommerüberraschung', 'First term');
-is($stream->[0]->[3], 's:Sommerüberraschung', 'First term');
-is($stream->[0]->[4], 'sgbr/l:Sommerüberraschung', 'First term');
-ok(!defined $stream->[0]->[5], 'First term');
+is($stream->[0]->[1], '<>:base/s:t$<b>64<i>0<i>365<i>50<b>0', 'Text Boundary');
+is($stream->[0]->[2], '_0$<i>0<i>18', 'Position');
+is($stream->[0]->[3], 'i:sommerüberraschung', 'First term');
+is($stream->[0]->[4], 's:Sommerüberraschung', 'First term');
+is($stream->[0]->[5], 'sgbr/l:Sommerüberraschung', 'First term');
+ok(!defined $stream->[0]->[6], 'First term');
is($stream->[1]->[0], '_1$<i>19<i>21', 'Position');
is($stream->[1]->[1], 'i:es', 'Second term');
diff --git a/t/sgbr/pos.t b/t/sgbr/pos.t
index e87a1d7..0163ed1 100644
--- a/t/sgbr/pos.t
+++ b/t/sgbr/pos.t
@@ -33,10 +33,11 @@
my $stream = $data->{stream};
is($stream->[0]->[0], '-:tokens$<i>51', 'Token number');
-is($stream->[0]->[1], '_0$<i>0<i>18', 'Position');
-is($stream->[0]->[2], 'i:sommerüberraschung', 'First term');
-is($stream->[0]->[3], 's:Sommerüberraschung', 'First term');
-is($stream->[0]->[4], 'sgbr/p:NN', 'First term POS');
+is($stream->[0]->[1], '<>:base/s:t$<b>64<i>0<i>365<i>50<b>0', 'Text boundary');
+is($stream->[0]->[2], '_0$<i>0<i>18', 'Position');
+is($stream->[0]->[3], 'i:sommerüberraschung', 'First term');
+is($stream->[0]->[4], 's:Sommerüberraschung', 'First term');
+is($stream->[0]->[5], 'sgbr/p:NN', 'First term POS');
is($stream->[1]->[3], 'sgbr/p:PPER', 'First term POS');
is($stream->[-1]->[3], 'sgbr/p:NE', 'Last term POS');
diff --git a/t/sgbr/token.t b/t/sgbr/token.t
index cafa3c4..da17f8a 100644
--- a/t/sgbr/token.t
+++ b/t/sgbr/token.t
@@ -31,9 +31,9 @@
my $stream = $data->{stream};
is($stream->[0]->[0], '-:tokens$<i>51', 'Token number');
-is($stream->[0]->[1], '_0$<i>0<i>18', 'Position');
-is($stream->[0]->[2], 'i:sommerüberraschung', 'First term');
-is($stream->[0]->[3], 's:Sommerüberraschung', 'First term');
+is($stream->[0]->[2], '_0$<i>0<i>18', 'Position');
+is($stream->[0]->[3], 'i:sommerüberraschung', 'First term');
+is($stream->[0]->[4], 's:Sommerüberraschung', 'First term');
is($stream->[-1]->[0], '_50$<i>359<i>364', 'Last position');
is($stream->[-1]->[1], 'i:kevin', 'Last term');
is($stream->[-1]->[2], 's:Kevin', 'Last term');
diff --git a/t/transform.t b/t/transform.t
index 379595e..9f4c842 100644
--- a/t/transform.t
+++ b/t/transform.t
@@ -133,7 +133,7 @@
is_deeply(
_t2h($tokens->stream->pos(0)->to_string),
- _t2h('[(0-1)s:A|i:a|_0$<i>0<i>1|-:tokens$<i>923|mate/p:XY|<>:base/s:s$<b>64<i>0<i>74<i>13<b>2|<>:base/s:t$<b>64<i>0<i>6083<i>923<b>0|-:base/sentences$<i>96]'),
+ _t2h('[(0-1)s:A|i:a|_0$<i>0<i>1|-:tokens$<i>923|mate/p:XY|<>:base/s:s$<b>64<i>0<i>74<i>13<b>2|<>:base/s:t$<b>64<i>0<i>6083<i>922<b>0|-:base/sentences$<i>96]'),
'Startinfo'
);
@@ -146,7 +146,7 @@
'-:tokens$<i>923|'.
'mate/p:XY|'.
'<>:base/s:s$<b>64<i>0<i>74<i>13<b>2|'.
- '<>:base/s:t$<b>64<i>0<i>6083<i>923<b>0|'.
+ '<>:base/s:t$<b>64<i>0<i>6083<i>922<b>0|'.
'-:base/sentences$<i>96|'.
'<>:base/s:p$<b>64<i>0<i>224<i>34<b>1|'.
'-:base/paragraphs$<i>76|'.