Added text boundary base/s:t during tokenization
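
Every token stream now starts with a <>:base/s:t span covering the
whole primary text, so annotations on the first token shift by one
position; the test expectations are updated accordingly. As a rough
sketch of the resulting index term, with the field layout read off the
add() call in Tokenizer.pm and the updated tests, the first token of
test document 0001 now carries

    <>:base/s:t$<b>64<i>0<i>129<i>17<b>0

i.e. pti 64, character offsets 0 to 129 (the primary data length), end
token position 17 ($have - 1), and the '<b>0' payload. The comparator
in MultiTermToken no longer treats the remaining case as equal but
falls back to comparing field 5 of the term arrays, presumably to keep
same-range spans (the new text span next to sentence or paragraph
spans) in a deterministic order.
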
Change-Id: I7f7f96bbdbc129ab273aa6e2ac8326f4f1207585
diff --git a/lib/KorAP/XML/Field/MultiTermToken.pm b/lib/KorAP/XML/Field/MultiTermToken.pm
index 52c3d20..3087e9d 100644
--- a/lib/KorAP/XML/Field/MultiTermToken.pm
+++ b/lib/KorAP/XML/Field/MultiTermToken.pm
@@ -207,7 +207,7 @@
return 1;
}
else {
- return 1;
+ return $a->[5] cmp $b->[5];
};
};
};
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 7e589e7..e57527a 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -179,6 +179,16 @@
# Add token count
$mtts->add_meta('tokens', '<i>' . $have);
+ # Add text boundary
+ $mtts->pos(0)->add(
+ term => '<>:base/s:t',
+ o_start => 0,
+ p_end => ($have - 1),
+ o_end => $doc->primary->data_length,
+ payload => '<b>0',
+ pti => 64
+ );
+
# Create a gap for the
if ($doc->primary->data_length >= ($old - 1)) {
$range->gap($old, $doc->primary->data_length + 1, $have-1)
diff --git a/t/index/base_paragraphs.t b/t/index/base_paragraphs.t
index 7e606ba..71484cc 100644
--- a/t/index/base_paragraphs.t
+++ b/t/index/base_paragraphs.t
@@ -17,8 +17,9 @@
like($data->{foundries}, qr!base/paragraphs!, 'data');
is($data->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Number of paragraphs');
is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[2], '<>:base/s:p$<b>64<i>0<i>129<i>17<b>1', 'Paragraph');
-is($data->{stream}->[0]->[3], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[3], '<>:base/s:p$<b>64<i>0<i>129<i>17<b>1', 'Paragraph');
+is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
done_testing;
diff --git a/t/index/connexor_morpho.t b/t/index/connexor_morpho.t
index e977ebe..d0be68c 100644
--- a/t/index/connexor_morpho.t
+++ b/t/index/connexor_morpho.t
@@ -14,7 +14,8 @@
my $data = $tokens->to_data->{data};
like($data->{foundries}, qr!connexor/morpho!, 'data');
-is($data->{stream}->[0]->[1], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[2], '_0$<i>0<i>3', 'Position');
is($data->{stream}->[1]->[1], 'cnx/l:letzt', 'Lemma');
is($data->{stream}->[1]->[2], 'cnx/p:A', 'POS');
is($data->{stream}->[2]->[1], 'cnx/l:kulturell', 'Lemma');
diff --git a/t/index/connexor_phrase.t b/t/index/connexor_phrase.t
index 1c218c6..2c1c933 100644
--- a/t/index/connexor_phrase.t
+++ b/t/index/connexor_phrase.t
@@ -15,6 +15,7 @@
my $data = $tokens->to_data->{data};
like($data->{foundries}, qr!connexor/phrase!, 'data');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
is($data->{stream}->[1]->[0], '<>:cnx/c:np$<b>64<i>4<i>30<i>4<b>0', 'Noun phrase');
done_testing;
diff --git a/t/index/connexor_sentences.t b/t/index/connexor_sentences.t
index 83a2fb6..f03cec0 100644
--- a/t/index/connexor_sentences.t
+++ b/t/index/connexor_sentences.t
@@ -42,8 +42,9 @@
like($data->{foundries}, qr!connexor/sentences!, 'data');
is($data->{stream}->[0]->[0], '-:cnx/sentences$<i>1', 'Number of paragraphs');
is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[2], '<>:cnx/s:s$<b>64<i>0<i>129<i>17<b>0', 'Sentence');
-is($data->{stream}->[0]->[3], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[3], '<>:cnx/s:s$<b>64<i>0<i>129<i>17<b>0', 'Sentence');
+is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
done_testing;
diff --git a/t/index/corenlp_morpho.t b/t/index/corenlp_morpho.t
index 916053e..c9afdca 100644
--- a/t/index/corenlp_morpho.t
+++ b/t/index/corenlp_morpho.t
@@ -15,7 +15,8 @@
my $data = $tokens->to_data->{data};
like($data->{foundries}, qr!corenlp/morpho!, 'data');
like($data->{layerInfos}, qr!corenlp/p=tokens!, 'data');
-is($data->{stream}->[0]->[2], 'corenlp/p:APPRART', 'POS');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[3], 'corenlp/p:APPRART', 'POS');
is($data->{stream}->[1]->[1], 'corenlp/p:ADJ', 'POS');
is($data->{stream}->[2]->[1], 'corenlp/p:ADJA', 'POS');
diff --git a/t/index/corenlp_sentences.t b/t/index/corenlp_sentences.t
index 4242010..0102748 100644
--- a/t/index/corenlp_sentences.t
+++ b/t/index/corenlp_sentences.t
@@ -17,8 +17,9 @@
like($data->{foundries}, qr!corenlp/sentences!, 'data');
is($data->{stream}->[0]->[0], '-:corenlp/sentences$<i>1', 'Number of paragraphs');
is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[2], '<>:corenlp/s:s$<b>64<i>0<i>129<i>17<b>0', 'Text');
-is($data->{stream}->[0]->[3], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[3], '<>:corenlp/s:s$<b>64<i>0<i>129<i>17<b>0', 'Sentence');
+is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
is($data->{stream}->[-1]->[0], '_17$<i>124<i>128', 'Position');
done_testing;
diff --git a/t/index/dereko_struct.t b/t/index/dereko_struct.t
index f6f6f5e..4ecf608 100644
--- a/t/index/dereko_struct.t
+++ b/t/index/dereko_struct.t
@@ -22,13 +22,12 @@
'<>:dereko/s:idsHeader$<b>65<i>0<i>0<i>0<b>1<s>2',
'Empty element');
-# Attributes:
-is($data->{stream}->[0]->[10],
- '@:dereko/s:version:1.1$<b>17<s>2',
- 'Attribute of idsHeader');
+is($data->{stream}->[0]->[5], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+
+# Attributes:
is($data->{stream}->[0]->[11],
- '@:dereko/s:TEIform:teiHeader$<b>17<s>2',
+ '@:dereko/s:type:text$<b>17<s>2',
'Attribute of idsHeader');
is($data->{stream}->[0]->[12],
@@ -36,9 +35,11 @@
'Attribute of idsHeader');
is($data->{stream}->[0]->[13],
- '@:dereko/s:type:text$<b>17<s>2',
+ '@:dereko/s:version:1.1$<b>17<s>2',
'Attribute of idsHeader');
+
+
is($data->{stream}->[0]->[14],
'@:dereko/s:pattern:text$<b>17<s>2',
'Attribute of idsHeader');
diff --git a/t/index/glemm_morpho.t b/t/index/glemm_morpho.t
index 8959277..34e70c5 100644
--- a/t/index/glemm_morpho.t
+++ b/t/index/glemm_morpho.t
@@ -17,7 +17,8 @@
like($data->{foundries}, qr!glemm/morpho!, 'data');
like($data->{layerInfos}, qr!glemm/l=tokens!, 'data');
-is($data->{stream}->[0]->[2], 'glemm/l:__zu', 'Lemma');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[3], 'glemm/l:__zu', 'Lemma');
is($data->{stream}->[1]->[1], 'glemm/l:__letzt-', 'Lemma');
is($data->{stream}->[3]->[1], 'glemm/l:_+an-', 'Lemma');
diff --git a/t/index/mate_morpho.t b/t/index/mate_morpho.t
index 6400a42..1f5fb91 100644
--- a/t/index/mate_morpho.t
+++ b/t/index/mate_morpho.t
@@ -19,11 +19,12 @@
like($data->{layerInfos}, qr!mate/l=tokens!, 'data');
like($data->{layerInfos}, qr!mate/m=tokens!, 'data');
-is($data->{stream}->[0]->[3], 'mate/l:zu', 'POS');
-is($data->{stream}->[0]->[4], 'mate/m:case:dat', 'POS');
-is($data->{stream}->[0]->[5], 'mate/m:gender:neut', 'POS');
-is($data->{stream}->[0]->[6], 'mate/m:number:sg', 'POS');
-is($data->{stream}->[0]->[7], 'mate/p:APPRART', 'POS');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[4], 'mate/l:zu', 'POS');
+is($data->{stream}->[0]->[5], 'mate/m:case:dat', 'POS');
+is($data->{stream}->[0]->[6], 'mate/m:gender:neut', 'POS');
+is($data->{stream}->[0]->[7], 'mate/m:number:sg', 'POS');
+is($data->{stream}->[0]->[8], 'mate/p:APPRART', 'POS');
is($data->{stream}->[-1]->[2], 'mate/l:werden', 'POS');
is($data->{stream}->[-1]->[3], 'mate/m:mood:ind', 'POS');
diff --git a/t/index/mate_morpho_attr.t b/t/index/mate_morpho_attr.t
index 792f139..04daffa 100644
--- a/t/index/mate_morpho_attr.t
+++ b/t/index/mate_morpho_attr.t
@@ -18,11 +18,12 @@
like($data->{layerInfos}, qr!mate/p=tokens!, 'data');
like($data->{layerInfos}, qr!mate/l=tokens!, 'data');
-is($data->{stream}->[0]->[1], '@:gender=neut$<b>16<s>1', 'POS');
-is($data->{stream}->[0]->[2], '@:number=sg$<b>16<s>1', 'POS');
-is($data->{stream}->[0]->[3], '@:case=dat$<b>16<s>1', 'POS');
-is($data->{stream}->[0]->[6], 'mate/l:zu', 'Lemmata');
-is($data->{stream}->[0]->[7], 'mate/p:APPRART$<b>128<s>1', 'POS');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[2], '@:gender=neut$<b>16<s>1', 'POS');
+is($data->{stream}->[0]->[3], '@:number=sg$<b>16<s>1', 'POS');
+is($data->{stream}->[0]->[4], '@:case=dat$<b>16<s>1', 'POS');
+is($data->{stream}->[0]->[7], 'mate/l:zu', 'Lemmata');
+is($data->{stream}->[0]->[8], 'mate/p:APPRART$<b>128<s>1', 'POS');
is($data->{stream}->[-1]->[0], '@:mood=ind$<b>16<s>1', 'POS');
is($data->{stream}->[-1]->[1], '@:tense=pres$<b>16<s>1', 'POS');
diff --git a/t/index/opennlp_morpho.t b/t/index/opennlp_morpho.t
index cf57006..eb94a31 100644
--- a/t/index/opennlp_morpho.t
+++ b/t/index/opennlp_morpho.t
@@ -10,7 +10,7 @@
ok(my $tokens = TestInit::tokens('0001'), 'Parse tokens');
-is($tokens->stream->pos(0)->to_string, '[(0-3)-:tokens$<i>18|_0$<i>0<i>3|i:zum|s:Zum]', 'Token is correct');
+is($tokens->stream->pos(0)->to_string, '[(0-3)-:tokens$<i>18|<>:base/s:t$<b>64<i>0<i>129<i>17<b>0|_0$<i>0<i>3|i:zum|s:Zum]', 'Token is correct');
is($tokens->stream->pos(1)->to_string, '[(4-11)_1$<i>4<i>11|i:letzten|s:letzten]', 'Token is correct');
@@ -42,16 +42,13 @@
ok(!$tokens->stream->pos($i++), 'No more tokens');
-
-
-
ok($tokens->add('OpenNLP', 'Morpho'), 'Add Structure');
my $data = $tokens->to_data->{data};
like($data->{foundries}, qr!opennlp/morpho!, 'data');
-is($data->{stream}->[0]->[1], '_0$<i>0<i>3', 'Position');
-is($data->{stream}->[0]->[3], 'opennlp/p:APPRART', 'POS');
+is($data->{stream}->[0]->[2], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[4], 'opennlp/p:APPRART', 'POS');
is($data->{stream}->[1]->[2], 'opennlp/p:ADJA', 'POS');
is($data->{stream}->[2]->[2], 'opennlp/p:ADJA', 'POS');
is($data->{stream}->[-1]->[2], 'opennlp/p:VAFIN', 'POS');
diff --git a/t/index/opennlp_sentences.t b/t/index/opennlp_sentences.t
index 73808d6..98bb243 100644
--- a/t/index/opennlp_sentences.t
+++ b/t/index/opennlp_sentences.t
@@ -17,8 +17,8 @@
like($data->{foundries}, qr!opennlp/sentences!, 'data');
is($data->{stream}->[0]->[0], '-:opennlp/sentences$<i>1', 'Number of Sentences');
is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[2], '<>:opennlp/s:s$<b>64<i>0<i>129<i>17<b>0', 'Sentence');
-is($data->{stream}->[0]->[3], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[3], '<>:opennlp/s:s$<b>64<i>0<i>129<i>17<b>0', 'Sentence');
+is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
done_testing;
diff --git a/t/index/tt_morpho.t b/t/index/tt_morpho.t
index e529af8..5d50b38 100644
--- a/t/index/tt_morpho.t
+++ b/t/index/tt_morpho.t
@@ -18,8 +18,8 @@
like($data->{layerInfos}, qr!tt/p=tokens!, 'data');
like($data->{layerInfos}, qr!tt/l=tokens!, 'data');
-is($data->{stream}->[0]->[4], 'tt/l:zum$<b>129<b>255', 'POS');
-is($data->{stream}->[0]->[5], 'tt/p:APPRART$<b>129<b>255', 'POS');
+is($data->{stream}->[0]->[5], 'tt/l:zum$<b>129<b>255', 'POS');
+is($data->{stream}->[0]->[6], 'tt/p:APPRART$<b>129<b>255', 'POS');
is($data->{stream}->[3]->[3], 'tt/l:Anlaß$<b>129<b>255', 'POS');
is($data->{stream}->[3]->[4], 'tt/p:NN$<b>129<b>255', 'POS');
diff --git a/t/index/tt_sentences.t b/t/index/tt_sentences.t
index 04c5892..c47ebb6 100644
--- a/t/index/tt_sentences.t
+++ b/t/index/tt_sentences.t
@@ -19,8 +19,8 @@
like($data->{foundries}, qr!treetagger/sentences!, 'data');
is($data->{stream}->[0]->[0], '-:tokens$<i>18', 'Number of tokens');
is($data->{stream}->[0]->[1], '-:tt/sentences$<i>1', 'Number of paragraphs');
-is($data->{stream}->[0]->[2], '<>:tt/s:s$<b>64<i>0<i>130<i>17<b>0', 'Text');
-is($data->{stream}->[0]->[3], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[3], '<>:tt/s:s$<b>64<i>0<i>130<i>17<b>0', 'Sentence');
+is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
is($data->{stream}->[-1]->[0], '_17$<i>124<i>128', 'Position');
done_testing;
diff --git a/t/index/xip_constituency.t b/t/index/xip_constituency.t
index 89972e4..17e6c7e 100644
--- a/t/index/xip_constituency.t
+++ b/t/index/xip_constituency.t
@@ -18,9 +18,11 @@
# The length includes the punct - but that doesn't matter
is($data->{stream}->[0]->[1], '<>:xip/c:PREP$<b>64<i>0<i>3<i>1<b>3', 'Prep phrase');
-is($data->{stream}->[0]->[2], '<>:xip/c:PP$<b>64<i>0<i>30<i>4<b>2', 'pp phrase');
-is($data->{stream}->[0]->[3], '<>:xip/c:TOP$<b>64<i>0<i>129<i>17<b>0', 'top phrase');
-is($data->{stream}->[0]->[4], '<>:xip/c:MC$<b>64<i>0<i>129<i>17<b>1', 'mc phrase');
+is($data->{stream}->[0]->[4], '<>:xip/c:PP$<b>64<i>0<i>30<i>4<b>2', 'PP phrase');
+done_testing;
+__END__
+is($data->{stream}->[0]->[6], '<>:xip/c:TOP$<b>64<i>0<i>129<i>17<b>0', 'top phrase');
+is($data->{stream}->[0]->[7], '<>:xip/c:MC$<b>64<i>0<i>129<i>17<b>1', 'mc phrase');
is($data->{stream}->[-1]->[0], '<>:xip/c:VERB$<b>64<i>124<i>128<i>18<b>4', 'Noun phrase');