Fix buffer bug in token writer

The token-writer flush guard in datok.go and matrix.go now treats
negative token spans like empty ones (buffc-bufft <= 0 instead of == 0)
and, when advancing the cursor still leaves the buffer empty, signals
EOF instead of emitting a bogus token.

Change-Id: I615618fe7833e1b97ae86b23d1fee760401154db
diff --git a/Changes b/Changes
index 629a776..cdc4937 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.2.0 2023-04-26
+ - Add English tokenizer.
+ - Fix buffer bug.
+
0.1.7 2023-02-28
- Add dependabot checks.
- Add update command.
diff --git a/datok.go b/datok.go
index 48bb757..f19ce5a 100644
--- a/datok.go
+++ b/datok.go
@@ -932,8 +932,12 @@
// token and start blank at the root node of the automaton for the remaining data.
// It may be beneficial to have something like a "drop()" event to capture these cases,
// as they are likely the result of a bad automaton design.
- if buffc-bufft == 0 {
+ if buffc-bufft <= 0 {
buffc++
+ if buffc == 0 {
+ eof = true
+ break
+ }
}
if DEBUG {
diff --git a/matrix.go b/matrix.go
index 1861528..eb88086 100644
--- a/matrix.go
+++ b/matrix.go
@@ -503,13 +503,21 @@
// token and start blank at the root node of the automaton for the remaining data.
// It may be beneficial to have something like a "drop()" event to capture these cases,
// as they are likely the result of a bad automaton design.
- if buffc-bufft == 0 {
+ if buffc-bufft <= 0 {
buffc++
+ if buffc == 0 {
+ eof = true
+ break
+ }
}
if DEBUG {
log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
}
+
w.Token(bufft, buffer[:buffc])
sentenceEnd = false
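
Note: both hunks apply the same guard. A minimal standalone sketch of the
new behavior follows; the flushToken helper and its driver are hypothetical,
and only the buffc/bufft/eof logic is taken from the patch:

    package main

    import "fmt"

    // flushToken mirrors the patched guard: if the token span
    // [bufft, buffc) is empty or inverted, advance the cursor once;
    // if that still leaves it at zero, nothing is buffered and the
    // caller should treat the input as exhausted (eof).
    func flushToken(buffer []byte, bufft, buffc int) (token string, eof bool) {
    	if buffc-bufft <= 0 {
    		buffc++
    		if buffc == 0 {
    			return "", true
    		}
    	}
    	return string(buffer[bufft:buffc]), false
    }

    func main() {
    	buf := []byte("dead.")
    	fmt.Println(flushToken(buf, 0, 4))  // "dead" false
    	fmt.Println(flushToken(buf, 4, 4))  // "." false  (empty span, cursor advanced)
    	fmt.Println(flushToken(buf, 0, -1)) // "" true   (nothing buffered: EOF)
    }

Before the patch, only the exact-zero case advanced the cursor, so a
negative span or an empty buffer could produce a zero-width token.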
diff --git a/matrix_test.go b/matrix_test.go
index 31812d7..40ddb8d 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -104,6 +104,35 @@
assert.Equal("bauamt", tokens[3])
}
+func TestMatrixCliticRule(t *testing.T) {
+ assert := assert.New(t)
+ mat := LoadMatrixFile("testdata/clitic_test.matok")
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var tokens []string
+
+ tokens = ttokenize(mat, w, "ibauamt")
+ assert.Equal("ibauamt", tokens[0])
+
+ exstring := "dead. "
+
+ tokens = ttokenize(mat, w, exstring)
+ assert.Equal("dead", tokens[0])
+ assert.Equal(".", tokens[1])
+
+ w.Reset()
+ tws := NewTokenWriter(w, TOKENS|SENTENCES)
+
+ assert.True(mat.TransduceTokenWriter(
+ strings.NewReader(exstring), tws),
+ )
+ tws.Flush()
+
+ matStr := w.String()
+ assert.Equal("dead\n.\n\n\n\n\n\n\n", matStr)
+}
+
func TestMatrixReadWriteTokenizer(t *testing.T) {
assert := assert.New(t)
foma := LoadFomaFile("testdata/simpletok.fst")
diff --git a/testdata/clitic_test.fst b/testdata/clitic_test.fst
new file mode 100644
index 0000000..b373352
--- /dev/null
+++ b/testdata/clitic_test.fst
Binary files differ
diff --git a/testdata/clitic_test.matok b/testdata/clitic_test.matok
new file mode 100644
index 0000000..9cc4971
--- /dev/null
+++ b/testdata/clitic_test.matok
Binary files differ
diff --git a/testdata/clitic_test.xfst b/testdata/clitic_test.xfst
new file mode 100644
index 0000000..1b129aa
--- /dev/null
+++ b/testdata/clitic_test.xfst
@@ -0,0 +1,13 @@
+define TB "@_TOKEN_BOUND_@";
+define WS [" "|"\u000a"|"\u0009"];
+define PUNCT ["."|"?"|"!"];
+define Char \[WS|PUNCT];
+define Word Char+;
+
+! Compose token boundaries
+define Tokenizer [[Word|PUNCT] @-> ... TB] .o.
+ ! Compose whitespace removal
+[WS+ @-> 0] .o.
+ ! Compose sentence ends
+[[PUNCT+] @-> ... TB \/ TB _ ];
+read regex Tokenizer .o. [{'re} ->@ TB ... ];
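
For context on the new grammar: the composed Tokenizer marks word and
punctuation boundaries, deletes whitespace, and marks sentence ends; the
final [{'re} ->@ TB ... ] rule additionally inserts a token boundary
before the English clitic 're. The committed test only exercises
"ibauamt" and "dead. "; a hypothetical follow-up test for the clitic
split itself, in the style of matrix_test.go, might look like the sketch
below (the expected two-way split is inferred from the rule, not
verified against the compiled automaton):

    func TestMatrixCliticSplit(t *testing.T) {
    	assert := assert.New(t)
    	mat := LoadMatrixFile("testdata/clitic_test.matok")

    	b := make([]byte, 0, 2048)
    	w := bytes.NewBuffer(b)

    	// "they're" should yield the base word and the clitic as
    	// separate tokens, per the {'re} boundary rule above.
    	tokens := ttokenize(mat, w, "they're")
    	assert.Equal("they", tokens[0])
    	assert.Equal("'re", tokens[1])
    }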