Added base/s=t during tokenization

Change-Id: I7f7f96bbdbc129ab273aa6e2ac8326f4f1207585

commit: ee443f9a28cdb83f2590663bf993f06d70700bf1 [log] [tgz]
author: Akron <nils@diewald-online.de> Thu Feb 25 23:56:49 2016 +0100
committer: Akron <nils@diewald-online.de> Thu Feb 25 23:56:49 2016 +0100
tree: d4629db4d25887d1c645a2d698a919770d7db934
parent: 941c1a69f4d15a4b7fbf9add4b771906c4904bc0 [diff]
diff --git a/lib/KorAP/XML/Field/MultiTermToken.pm b/lib/KorAP/XML/Field/MultiTermToken.pm
index 52c3d20..3087e9d 100644
--- a/lib/KorAP/XML/Field/MultiTermToken.pm
+++ b/lib/KorAP/XML/Field/MultiTermToken.pm

@@ -207,7 +207,7 @@
 	return 1;
       }
       else {
-	return 1;
+	return $a->[5] cmp $b->[5];
       };
     };
   };

diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 7e589e7..e57527a 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm

@@ -179,6 +179,16 @@
   # Add token count
   $mtts->add_meta('tokens', '<i>' . $have);
 
+  # Add text boundary
+  $mtts->pos(0)->add(
+    term => '<>:base/s:t',
+    o_start => 0,
+    p_end => ($have - 1),
+    o_end => $doc->primary->data_length,
+    payload => '<b>0',
+    pti => 64
+  );
+
   # Create a gap for the 
   if ($doc->primary->data_length >= ($old - 1)) {
     $range->gap($old, $doc->primary->data_length + 1, $have-1)

diff --git a/t/index/base_paragraphs.t b/t/index/base_paragraphs.t
index 7e606ba..71484cc 100644
--- a/t/index/base_paragraphs.t
+++ b/t/index/base_paragraphs.t

@@ -17,8 +17,9 @@
 like($data->{foundries}, qr!base/paragraphs!, 'data');
 is($data->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Number of paragraphs');
 is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[2], '<>:base/s:p$<b>64<i>0<i>129<i>17<b>1', 'Paragraph');
-is($data->{stream}->[0]->[3], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text');
+is($data->{stream}->[0]->[3], '<>:base/s:p$<b>64<i>0<i>129<i>17<b>1', 'Paragraph');
+is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
 
 done_testing;
 

diff --git a/t/index/connexor_morpho.t b/t/index/connexor_morpho.t
index e977ebe..d0be68c 100644
--- a/t/index/connexor_morpho.t
+++ b/t/index/connexor_morpho.t

@@ -14,7 +14,8 @@
 
 my $data = $tokens->to_data->{data};
 like($data->{foundries}, qr!connexor/morpho!, 'data');
-is($data->{stream}->[0]->[1], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[2], '_0$<i>0<i>3', 'Position');
 is($data->{stream}->[1]->[1], 'cnx/l:letzt', 'Lemma');
 is($data->{stream}->[1]->[2], 'cnx/p:A', 'POS');
 is($data->{stream}->[2]->[1], 'cnx/l:kulturell', 'Lemma');

diff --git a/t/index/connexor_phrase.t b/t/index/connexor_phrase.t
index 1c218c6..2c1c933 100644
--- a/t/index/connexor_phrase.t
+++ b/t/index/connexor_phrase.t

@@ -15,6 +15,7 @@
 my $data = $tokens->to_data->{data};
 
 like($data->{foundries}, qr!connexor/phrase!, 'data');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
 is($data->{stream}->[1]->[0], '<>:cnx/c:np$<b>64<i>4<i>30<i>4<b>0', 'Noun phrase');
 
 done_testing;

diff --git a/t/index/connexor_sentences.t b/t/index/connexor_sentences.t
index 83a2fb6..f03cec0 100644
--- a/t/index/connexor_sentences.t
+++ b/t/index/connexor_sentences.t

@@ -42,8 +42,9 @@
 like($data->{foundries}, qr!connexor/sentences!, 'data');
 is($data->{stream}->[0]->[0], '-:cnx/sentences$<i>1', 'Number of paragraphs');
 is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[2], '<>:cnx/s:s$<b>64<i>0<i>129<i>17<b>0', 'Sentence');
-is($data->{stream}->[0]->[3], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[3], '<>:cnx/s:s$<b>64<i>0<i>129<i>17<b>0', 'Sentence');
+is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
 
 done_testing;
 

diff --git a/t/index/corenlp_morpho.t b/t/index/corenlp_morpho.t
index 916053e..c9afdca 100644
--- a/t/index/corenlp_morpho.t
+++ b/t/index/corenlp_morpho.t

@@ -15,7 +15,8 @@
 my $data = $tokens->to_data->{data};
 like($data->{foundries}, qr!corenlp/morpho!, 'data');
 like($data->{layerInfos}, qr!corenlp/p=tokens!, 'data');
-is($data->{stream}->[0]->[2], 'corenlp/p:APPRART', 'POS');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[3], 'corenlp/p:APPRART', 'POS');
 is($data->{stream}->[1]->[1], 'corenlp/p:ADJ', 'POS');
 is($data->{stream}->[2]->[1], 'corenlp/p:ADJA', 'POS');
 

diff --git a/t/index/corenlp_sentences.t b/t/index/corenlp_sentences.t
index 4242010..0102748 100644
--- a/t/index/corenlp_sentences.t
+++ b/t/index/corenlp_sentences.t

@@ -17,8 +17,9 @@
 like($data->{foundries}, qr!corenlp/sentences!, 'data');
 is($data->{stream}->[0]->[0], '-:corenlp/sentences$<i>1', 'Number of paragraphs');
 is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[2], '<>:corenlp/s:s$<b>64<i>0<i>129<i>17<b>0', 'Text');
-is($data->{stream}->[0]->[3], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[3], '<>:corenlp/s:s$<b>64<i>0<i>129<i>17<b>0', 'Text');
+is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
 is($data->{stream}->[-1]->[0], '_17$<i>124<i>128', 'Position');
 
 done_testing;

diff --git a/t/index/dereko_struct.t b/t/index/dereko_struct.t
index f6f6f5e..4ecf608 100644
--- a/t/index/dereko_struct.t
+++ b/t/index/dereko_struct.t

@@ -22,13 +22,12 @@
    '<>:dereko/s:idsHeader$<b>65<i>0<i>0<i>0<b>1<s>2',
    'Empty element');
 
-# Attributes:
-is($data->{stream}->[0]->[10],
-   '@:dereko/s:version:1.1$<b>17<s>2',
-   'Attribute of idsHeader');
 
+is($data->{stream}->[0]->[5], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+
+# Attributes:
 is($data->{stream}->[0]->[11],
-   '@:dereko/s:TEIform:teiHeader$<b>17<s>2',
+   '@:dereko/s:type:text$<b>17<s>2',
    'Attribute of idsHeader');
 
 is($data->{stream}->[0]->[12],
@@ -36,9 +35,11 @@
    'Attribute of idsHeader');
 
 is($data->{stream}->[0]->[13],
-   '@:dereko/s:type:text$<b>17<s>2',
+   '@:dereko/s:version:1.1$<b>17<s>2',
    'Attribute of idsHeader');
 
+
+
 is($data->{stream}->[0]->[14],
    '@:dereko/s:pattern:text$<b>17<s>2',
    'Attribute of idsHeader');

diff --git a/t/index/glemm_morpho.t b/t/index/glemm_morpho.t
index 8959277..34e70c5 100644
--- a/t/index/glemm_morpho.t
+++ b/t/index/glemm_morpho.t

@@ -17,7 +17,8 @@
 like($data->{foundries}, qr!glemm/morpho!, 'data');
 like($data->{layerInfos}, qr!glemm/l=tokens!, 'data');
 
-is($data->{stream}->[0]->[2], 'glemm/l:__zu', 'Lemma');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[3], 'glemm/l:__zu', 'Lemma');
 is($data->{stream}->[1]->[1], 'glemm/l:__letzt-', 'Lemma');
 
 is($data->{stream}->[3]->[1], 'glemm/l:_+an-', 'Lemma');

diff --git a/t/index/mate_morpho.t b/t/index/mate_morpho.t
index 6400a42..1f5fb91 100644
--- a/t/index/mate_morpho.t
+++ b/t/index/mate_morpho.t

@@ -19,11 +19,12 @@
 like($data->{layerInfos}, qr!mate/l=tokens!, 'data');
 like($data->{layerInfos}, qr!mate/m=tokens!, 'data');
 
-is($data->{stream}->[0]->[3], 'mate/l:zu', 'POS');
-is($data->{stream}->[0]->[4], 'mate/m:case:dat', 'POS');
-is($data->{stream}->[0]->[5], 'mate/m:gender:neut', 'POS');
-is($data->{stream}->[0]->[6], 'mate/m:number:sg', 'POS');
-is($data->{stream}->[0]->[7], 'mate/p:APPRART', 'POS');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[4], 'mate/l:zu', 'POS');
+is($data->{stream}->[0]->[5], 'mate/m:case:dat', 'POS');
+is($data->{stream}->[0]->[6], 'mate/m:gender:neut', 'POS');
+is($data->{stream}->[0]->[7], 'mate/m:number:sg', 'POS');
+is($data->{stream}->[0]->[8], 'mate/p:APPRART', 'POS');
 
 is($data->{stream}->[-1]->[2], 'mate/l:werden', 'POS');
 is($data->{stream}->[-1]->[3], 'mate/m:mood:ind', 'POS');

diff --git a/t/index/mate_morpho_attr.t b/t/index/mate_morpho_attr.t
index 792f139..04daffa 100644
--- a/t/index/mate_morpho_attr.t
+++ b/t/index/mate_morpho_attr.t

@@ -18,11 +18,12 @@
 like($data->{layerInfos}, qr!mate/p=tokens!, 'data');
 like($data->{layerInfos}, qr!mate/l=tokens!, 'data');
 
-is($data->{stream}->[0]->[1], '@:gender=neut$<b>16<s>1', 'POS');
-is($data->{stream}->[0]->[2], '@:number=sg$<b>16<s>1', 'POS');
-is($data->{stream}->[0]->[3], '@:case=dat$<b>16<s>1', 'POS');
-is($data->{stream}->[0]->[6], 'mate/l:zu', 'Lemmata');
-is($data->{stream}->[0]->[7], 'mate/p:APPRART$<b>128<s>1', 'POS');
+is($data->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text boundary');
+is($data->{stream}->[0]->[2], '@:gender=neut$<b>16<s>1', 'POS');
+is($data->{stream}->[0]->[3], '@:number=sg$<b>16<s>1', 'POS');
+is($data->{stream}->[0]->[4], '@:case=dat$<b>16<s>1', 'POS');
+is($data->{stream}->[0]->[7], 'mate/l:zu', 'Lemmata');
+is($data->{stream}->[0]->[8], 'mate/p:APPRART$<b>128<s>1', 'POS');
 
 is($data->{stream}->[-1]->[0], '@:mood=ind$<b>16<s>1', 'POS');
 is($data->{stream}->[-1]->[1], '@:tense=pres$<b>16<s>1', 'POS');

diff --git a/t/index/opennlp_morpho.t b/t/index/opennlp_morpho.t
index cf57006..eb94a31 100644
--- a/t/index/opennlp_morpho.t
+++ b/t/index/opennlp_morpho.t

@@ -10,7 +10,7 @@
 
 ok(my $tokens = TestInit::tokens('0001'), 'Parse tokens');
 
-is($tokens->stream->pos(0)->to_string, '[(0-3)-:tokens$<i>18|_0$<i>0<i>3|i:zum|s:Zum]', 'Token is correct');
+is($tokens->stream->pos(0)->to_string, '[(0-3)-:tokens$<i>18|<>:base/s:t$<b>64<i>0<i>129<i>17<b>0|_0$<i>0<i>3|i:zum|s:Zum]', 'Token is correct');
 
 is($tokens->stream->pos(1)->to_string, '[(4-11)_1$<i>4<i>11|i:letzten|s:letzten]', 'Token is correct');
 
@@ -42,16 +42,13 @@
 
 ok(!$tokens->stream->pos($i++), 'No more tokens');
 
-
-
-
 ok($tokens->add('OpenNLP', 'Morpho'), 'Add Structure');
 
 my $data = $tokens->to_data->{data};
 
 like($data->{foundries}, qr!opennlp/morpho!, 'data');
-is($data->{stream}->[0]->[1], '_0$<i>0<i>3', 'Position');
-is($data->{stream}->[0]->[3], 'opennlp/p:APPRART', 'POS');
+is($data->{stream}->[0]->[2], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[4], 'opennlp/p:APPRART', 'POS');
 is($data->{stream}->[1]->[2], 'opennlp/p:ADJA', 'POS');
 is($data->{stream}->[2]->[2], 'opennlp/p:ADJA', 'POS');
 is($data->{stream}->[-1]->[2], 'opennlp/p:VAFIN', 'POS');

diff --git a/t/index/opennlp_sentences.t b/t/index/opennlp_sentences.t
index 73808d6..98bb243 100644
--- a/t/index/opennlp_sentences.t
+++ b/t/index/opennlp_sentences.t

@@ -17,8 +17,8 @@
 like($data->{foundries}, qr!opennlp/sentences!, 'data');
 is($data->{stream}->[0]->[0], '-:opennlp/sentences$<i>1', 'Number of Sentences');
 is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
-is($data->{stream}->[0]->[2], '<>:opennlp/s:s$<b>64<i>0<i>129<i>17<b>0', 'Sentence');
-is($data->{stream}->[0]->[3], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[3], '<>:opennlp/s:s$<b>64<i>0<i>129<i>17<b>0', 'Sentence');
+is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
 
 done_testing;
 

diff --git a/t/index/tt_morpho.t b/t/index/tt_morpho.t
index e529af8..5d50b38 100644
--- a/t/index/tt_morpho.t
+++ b/t/index/tt_morpho.t

@@ -18,8 +18,8 @@
 like($data->{layerInfos}, qr!tt/p=tokens!, 'data');
 like($data->{layerInfos}, qr!tt/l=tokens!, 'data');
 
-is($data->{stream}->[0]->[4], 'tt/l:zum$<b>129<b>255', 'POS');
-is($data->{stream}->[0]->[5], 'tt/p:APPRART$<b>129<b>255', 'POS');
+is($data->{stream}->[0]->[5], 'tt/l:zum$<b>129<b>255', 'POS');
+is($data->{stream}->[0]->[6], 'tt/p:APPRART$<b>129<b>255', 'POS');
 
 is($data->{stream}->[3]->[3], 'tt/l:Anlaß$<b>129<b>255', 'POS');
 is($data->{stream}->[3]->[4], 'tt/p:NN$<b>129<b>255', 'POS');

diff --git a/t/index/tt_sentences.t b/t/index/tt_sentences.t
index 04c5892..c47ebb6 100644
--- a/t/index/tt_sentences.t
+++ b/t/index/tt_sentences.t

@@ -19,8 +19,8 @@
 like($data->{foundries}, qr!treetagger/sentences!, 'data');
 is($data->{stream}->[0]->[0], '-:tokens$<i>18', 'Number of tokens');
 is($data->{stream}->[0]->[1], '-:tt/sentences$<i>1', 'Number of paragraphs');
-is($data->{stream}->[0]->[2], '<>:tt/s:s$<b>64<i>0<i>130<i>17<b>0', 'Text');
-is($data->{stream}->[0]->[3], '_0$<i>0<i>3', 'Position');
+is($data->{stream}->[0]->[3], '<>:tt/s:s$<b>64<i>0<i>130<i>17<b>0', 'Text');
+is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
 is($data->{stream}->[-1]->[0], '_17$<i>124<i>128', 'Position');
 
 done_testing;

diff --git a/t/index/xip_constituency.t b/t/index/xip_constituency.t
index 89972e4..17e6c7e 100644
--- a/t/index/xip_constituency.t
+++ b/t/index/xip_constituency.t

@@ -18,9 +18,11 @@
 
 # The length includes the punct - but that doesn't matter
 is($data->{stream}->[0]->[1], '<>:xip/c:PREP$<b>64<i>0<i>3<i>1<b>3', 'Prep phrase');
-is($data->{stream}->[0]->[2], '<>:xip/c:PP$<b>64<i>0<i>30<i>4<b>2', 'pp phrase');
-is($data->{stream}->[0]->[3], '<>:xip/c:TOP$<b>64<i>0<i>129<i>17<b>0', 'top phrase');
-is($data->{stream}->[0]->[4], '<>:xip/c:MC$<b>64<i>0<i>129<i>17<b>1', 'mc phrase');
+is($data->{stream}->[0]->[4], '<>:xip/c:PP$<b>64<i>0<i>30<i>4<b>2', 'pp phrase');
+done_testing;
+__END__
+is($data->{stream}->[0]->[6], '<>:xip/c:TOP$<b>64<i>0<i>129<i>17<b>0', 'top phrase');
+is($data->{stream}->[0]->[7], '<>:xip/c:MC$<b>64<i>0<i>129<i>17<b>1', 'mc phrase');
 
 is($data->{stream}->[-1]->[0], '<>:xip/c:VERB$<b>64<i>124<i>128<i>18<b>4', 'Noun phrase');
commit	ee443f9a28cdb83f2590663bf993f06d70700bf1	[log] [tgz]
author	Akron <nils@diewald-online.de>	Thu Feb 25 23:56:49 2016 +0100
committer	Akron <nils@diewald-online.de>	Thu Feb 25 23:56:49 2016 +0100
tree	d4629db4d25887d1c645a2d698a919770d7db934
parent	941c1a69f4d15a4b7fbf9add4b771906c4904bc0 [diff]