Fix end of text behaviour in case of sentence positions
Change-Id: Ic433dd3579d9a79df5734a405e682596c3ccddad
diff --git a/Changes b/Changes
index dd48d28..9af4aaa 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.2.2 2023-09-06
+ - Fix behaviour for end of text character positions
+ when no end of sentence occurred before.
+
0.2.1 2023-09-05
- Add english tokenizer.
- Fix buffer bug.
diff --git a/datok.go b/datok.go
index fba655e..1dd5cbe 100644
--- a/datok.go
+++ b/datok.go
@@ -1018,6 +1018,10 @@
if eot {
eot = false
+ if !sentenceEnd {
+ sentenceEnd = true
+ w.SentenceEnd(buffc)
+ }
textEnd = true
w.TextEnd(0)
if DEBUG {
diff --git a/matrix.go b/matrix.go
index e2d9858..7eda112 100644
--- a/matrix.go
+++ b/matrix.go
@@ -592,6 +592,10 @@
if eot {
eot = false
+ if !sentenceEnd {
+ sentenceEnd = true
+ w.SentenceEnd(buffc)
+ }
textEnd = true
w.TextEnd(buffc)
rewindBuffer = true
diff --git a/token_writer_test.go b/token_writer_test.go
index 868e69d..63b9c2b 100644
--- a/token_writer_test.go
+++ b/token_writer_test.go
@@ -85,6 +85,18 @@
matStr = w.String()
assert.Equal("1 5 5 6\n1 6\n0 3 3 4\n0 4\n", matStr)
+ w.Reset()
+ mat.TransduceTokenWriter(strings.NewReader("Tree\n\x04\n"), tws)
+
+ matStr = w.String()
+ assert.Equal("0 4\n0 4\n", matStr)
+
+ w.Reset()
+ mat.TransduceTokenWriter(strings.NewReader("Tree.\n\x04\n"), tws)
+
+ matStr = w.String()
+ assert.Equal("0 4 4 5\n0 5\n", matStr)
+
//
// Write sentence offsets without token offsets
tws = NewTokenWriter(w, SENTENCE_POS|NEWLINE_AFTER_EOT)