Fix buffer bug in token writer

The token-writer flush guard in datok.go and matrix.go now treats
negative token spans like empty ones (buffc-bufft <= 0 instead of == 0)
and, when advancing the cursor still leaves the buffer empty, signals
EOF instead of emitting a bogus token.

Change-Id: I615618fe7833e1b97ae86b23d1fee760401154db
diff --git a/Changes b/Changes
index 629a776..cdc4937 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.2.0 2023-04-26
+ - Add English tokenizer.
+ - Fix buffer bug.
+
0.1.7 2023-02-28
- Add dependabot checks.
- Add update command.
diff --git a/datok.go b/datok.go
index 48bb757..f19ce5a 100644
--- a/datok.go
+++ b/datok.go
@@ -932,8 +932,12 @@
// token and start blank at the root node of the automaton for the remaining data.
// It may be beneficial to have something like a "drop()" event to capture these cases,
// as they are likely the result of a bad automaton design.
- if buffc-bufft == 0 {
+ if buffc-bufft <= 0 {
buffc++
+ if buffc == 0 {
+ eof = true
+ break
+ }
}
if DEBUG {
diff --git a/matrix.go b/matrix.go
index 1861528..eb88086 100644
--- a/matrix.go
+++ b/matrix.go
@@ -503,13 +503,21 @@
// token and start blank at the root node of the automaton for the remaining data.
// It may be beneficial to have something like a "drop()" event to capture these cases,
// as they are likely the result of a bad automaton design.
- if buffc-bufft == 0 {
+ if buffc-bufft <= 0 {
buffc++
+ if buffc == 0 {
+ eof = true
+ break
+ }
}
if DEBUG {
log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
}
+
w.Token(bufft, buffer[:buffc])
sentenceEnd = false
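
Note: both hunks apply the same guard. A minimal standalone sketch of the
new behavior follows; the flushToken helper and its driver are hypothetical,
and only the buffc/bufft/eof logic is taken from the patch:

    package main

    import "fmt"

    // flushToken mirrors the patched guard: if the token span
    // [bufft, buffc) is empty or inverted, advance the cursor once;
    // if that still leaves it at zero, nothing is buffered and the
    // caller should treat the input as exhausted (eof).
    func flushToken(buffer []byte, bufft, buffc int) (token string, eof bool) {
    	if buffc-bufft <= 0 {
    		buffc++
    		if buffc == 0 {
    			return "", true
    		}
    	}
    	return string(buffer[bufft:buffc]), false
    }

    func main() {
    	buf := []byte("dead.")
    	fmt.Println(flushToken(buf, 0, 4))  // "dead" false
    	fmt.Println(flushToken(buf, 4, 4))  // "." false  (empty span, cursor advanced)
    	fmt.Println(flushToken(buf, 0, -1)) // "" true   (nothing buffered: EOF)
    }

Before the patch, only the exact-zero case advanced the cursor, so a
negative span or an empty buffer could produce a zero-width token.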
diff --git a/matrix_test.go b/matrix_test.go
index 31812d7..40ddb8d 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -104,6 +104,35 @@
assert.Equal("bauamt", tokens[3])
}
+func TestMatrixCliticRule(t *testing.T) {
+ assert := assert.New(t)
+ mat := LoadMatrixFile("testdata/clitic_test.matok")
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var tokens []string
+
+ tokens = ttokenize(mat, w, "ibauamt")
+ assert.Equal("ibauamt", tokens[0])
+
+ exstring := "dead. "
+
+ tokens = ttokenize(mat, w, exstring)
+ assert.Equal("dead", tokens[0])
+ assert.Equal(".", tokens[1])
+
+ w.Reset()
+ tws := NewTokenWriter(w, TOKENS|SENTENCES)
+
+ assert.True(mat.TransduceTokenWriter(
+ strings.NewReader(exstring), tws),
+ )
+ tws.Flush()
+
+ matStr := w.String()
+ assert.Equal("dead\n.\n\n\n\n\n\n\n", matStr)
+}
+
func TestMatrixReadWriteTokenizer(t *testing.T) {
assert := assert.New(t)
foma := LoadFomaFile("testdata/simpletok.fst")
diff --git a/testdata/clitic_test.fst b/testdata/clitic_test.fst
new file mode 100644
index 0000000..b373352
--- /dev/null
+++ b/testdata/clitic_test.fst
Binary files differ
diff --git a/testdata/clitic_test.matok b/testdata/clitic_test.matok
new file mode 100644
index 0000000..9cc4971
--- /dev/null
+++ b/testdata/clitic_test.matok
Binary files differ
diff --git a/testdata/clitic_test.xfst b/testdata/clitic_test.xfst
new file mode 100644
index 0000000..1b129aa
--- /dev/null
+++ b/testdata/clitic_test.xfst
@@ -0,0 +1,13 @@
+define TB "@_TOKEN_BOUND_@";
+define WS [" "|"\u000a"|"\u0009"];
+define PUNCT ["."|"?"|"!"];
+define Char \[WS|PUNCT];
+define Word Char+;
+
+! Compose token boundaries
+define Tokenizer [[Word|PUNCT] @-> ... TB] .o.
+ ! Compose whitespace removal
+[WS+ @-> 0] .o.
+ ! Compose sentence ends
+[[PUNCT+] @-> ... TB \/ TB _ ];
+read regex Tokenizer .o. [{'re} ->@ TB ... ];
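
For context on the new grammar: the composed Tokenizer marks word and
punctuation boundaries, deletes whitespace, and marks sentence ends; the
final [{'re} ->@ TB ... ] rule additionally inserts a token boundary
before the English clitic 're. The committed test only exercises
"ibauamt" and "dead. "; a hypothetical follow-up test for the clitic
split itself, in the style of matrix_test.go, might look like the sketch
below (the expected two-way split is inferred from the rule, not
verified against the compiled automaton):

    func TestMatrixCliticSplit(t *testing.T) {
    	assert := assert.New(t)
    	mat := LoadMatrixFile("testdata/clitic_test.matok")

    	b := make([]byte, 0, 2048)
    	w := bytes.NewBuffer(b)

    	// "they're" should yield the base word and the clitic as
    	// separate tokens, per the {'re} boundary rule above.
    	tokens := ttokenize(mat, w, "they're")
    	assert.Equal("they", tokens[0])
    	assert.Equal("'re", tokens[1])
    }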