Introduce token_writer object

This change also removes final state
sensitivity from the tokenizer. Tokens
now require a tokenend transition to
be treated as complete.
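
As a usage sketch only (not part of
this change), any type implementing the
new TokenWriterI interface can be
passed to TransduceTokenWriter. The
countWriter below is hypothetical and
merely counts tokens and sentence ends:

    // countWriter is a hypothetical TokenWriterI
    // implementation that counts tokens and
    // sentence ends instead of writing them out.
    type countWriter struct {
        tokens    int
        sentences int
    }

    func (cw *countWriter) Token(_ int, buf []rune) { cw.tokens++ }
    func (cw *countWriter) SentenceEnd()            { cw.sentences++ }
    func (cw *countWriter) Flush() error            { return nil }

    // Usage:
    //   cw := &countWriter{}
    //   dat.TransduceTokenWriter(strings.NewReader(text), cw)
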
diff --git a/Readme.md b/Readme.md
index a7930a2..956bcf6 100644
--- a/Readme.md
+++ b/Readme.md
@@ -23,6 +23,8 @@
   the `@_TOKEN_SYMBOL_@`.
 - Two consecutive `@_TOKEN_SYMBOL_@`s mark a sentence end.
 - Flag diacritics are not supported.
+- Final states are ignored. The `@_TOKEN_SYMBOL_@` marks
+  the end of a token instead.
 
 A minimal usable tokenizer written in XFST and following
 the guidelines to tokenizers in Beesley and Karttunen (2003)
diff --git a/datok.go b/datok.go
index fa6e44d..b70586c 100644
--- a/datok.go
+++ b/datok.go
@@ -724,14 +724,19 @@
 	return string(out)
 }
 
-// Transduce an input string against the double array
-// FSA. The rules are always greedy. If the automaton fails,
-// it takes the last possible token ending branch.
+func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
+	return dat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
+}
+
+// TransduceTokenWriter transduces an input string against
+// the double array FSA. The rules are always greedy. If the
+// automaton fails, it takes the last possible token ending
+// branch.
 //
 // Based on Mizobuchi et al (2000), p. 129,
 // with additional support for IDENTITY, UNKNOWN
 // and EPSILON transitions and NONTOKEN and TOKENEND handling.
-func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
+func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
 	var a int
 	var t0 uint32
 	t := uint32(1) // Initial state
@@ -742,6 +747,9 @@
 	epsilonState := uint32(0)
 	epsilonOffset := 0
 
+	// Remember if a sentence end was already written
+	sentenceEnd := false
+
 	// Implement a low level buffer for full control,
 	// however - it is probably better to introduce
 	// this on a higher level with a io.Reader interface
@@ -761,10 +769,10 @@
 	buffi := 0 // Buffer length
 
 	reader := bufio.NewReader(r)
-	writer := bufio.NewWriter(w)
-	defer writer.Flush()
+	defer w.Flush()
 
 	var char rune
+
 	var err error
 	eof := false
 	newchar := true
@@ -819,6 +827,10 @@
 				// Remember state for backtracking to last tokenend state
 				epsilonState = t0
 				epsilonOffset = buffo
+
+				if DEBUG {
+					fmt.Println("epsilonOffset is set to", buffo)
+				}
 			}
 		}
 
@@ -885,27 +897,33 @@
 				}
 				rewindBuffer = true
 			}
-		}
 
-		// Transition marks the end of a token - so flush the buffer
-		if ta.isTokenEnd() {
+		} else {
 
-			if buffi > 0 {
+			// Transition marks the end of a token - so flush the buffer
+			if buffo > 0 {
 				if DEBUG {
 					fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
 				}
-				writer.WriteString(string(buffer[:buffo]))
+				w.Token(0, buffer[:buffo])
 				rewindBuffer = true
+				sentenceEnd = false
+			} else {
+				sentenceEnd = true
+				w.SentenceEnd()
 			}
 			if DEBUG {
 				fmt.Println("-> Newline")
 			}
-			writer.WriteRune('\n')
 		}
 
 		// Rewind the buffer if necessary
 		if rewindBuffer {
 
+			if DEBUG {
+				fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
+			}
+
 			// TODO: Better as a ring buffer
 			for x, i := range buffer[buffo:buffi] {
 				buffer[x] = i
@@ -913,7 +931,9 @@
 
 			buffi -= buffo
 			// epsilonOffset -= buffo
-			epsilonOffset = buffo
+			epsilonOffset = 0
+			epsilonState = 0
+
 			buffo = 0
 			if DEBUG {
 				fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
@@ -948,46 +968,43 @@
 		fmt.Println("Entering final check")
 	}
 
-	// Automaton is in a final state, so flush the buffer and return
-	x := dat.array[t].getBase() + uint32(dat.final)
+	/*
+		The following code is for deprecated automata relying on
+		final states. Datok now requires final states to be marked
+		with tokenends.
 
-	if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
+			// Automaton is in a final state, so flush the buffer and return
+			x := dat.array[t].getBase() + uint32(dat.final)
 
-		if buffi > 0 {
-			if DEBUG {
-				fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
-			}
-			writer.WriteString(string(buffer[:buffi]))
+			if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
 
-			if dat.array[t].isTokenEnd() {
-				writer.WriteRune('\n')
-				if DEBUG {
-					fmt.Println("-> Newline")
+				if buffi > 0 {
+					if DEBUG {
+						fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
+					}
+					w.Token(0, buffer[:buffi])
 				}
-			}
-		}
 
-		// Add an additional sentence ending, if the file is over but no explicit
-		// sentence split was reached. This may be controversial and therefore
-		// optional via parameter.
-		if !dat.array[t0].isTokenEnd() {
-			writer.WriteRune('\n')
-			if DEBUG {
-				fmt.Println("-> Newline")
-			}
-		}
+				// Add an additional sentence ending, if the file is over but no explicit
+				// sentence split was reached. This may be controversial and therefore
+				// optional via parameter.
+				if !dat.array[t0].isTokenEnd() {
+					w.SentenceEnd()
+				}
 
-		// TODO:
-		//   There may be a new line at the end, from an epsilon,
-		//   so we may need to go on!
-		return true
-	}
+				// TODO:
+				//   There may be a new line at the end, from an epsilon,
+				//   so we may need to go on!
+				return true
+			}
+	*/
 
 	// Check epsilon transitions until a final state is reached
 	t0 = t
 	t = dat.array[t0].getBase() + uint32(dat.epsilon)
 	a = dat.epsilon
 	newchar = false
+
 	if dat.array[t].getCheck() == t0 {
 		// Remember state for backtracking to last tokenend state
 		goto PARSECHAR
@@ -1001,5 +1018,18 @@
 		}
 		goto PARSECHAR
 	}
-	return false
+
+	// Add an additional sentence ending, if the file is over but no explicit
+	// sentence split was reached. This may be controversial and therefore
+	// optional via parameter.
+	if !sentenceEnd {
+		// writer.WriteRune('\n')
+		// ::Sentenceend
+		w.SentenceEnd()
+		if DEBUG {
+			fmt.Println("-> Newline")
+		}
+	}
+
+	return true
 }
diff --git a/datok_test.go b/datok_test.go
index d1c0165..1143eb1 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -38,7 +38,7 @@
 	assert.True(tmatch(dat, "bau"))
 	assert.True(tmatch(dat, "bauamt"))
 	assert.False(tmatch(dat, "baum"))
-	assert.False(tmatch(dat, "baua"))
+	assert.True(tmatch(dat, "baua"))
 }
 
 func TestSimpleBranches(t *testing.T) {
@@ -47,7 +47,7 @@
 	// (bau | wahl) (amt | en)
 	tok := LoadFomaFile("testdata/wahlamt.fst")
 	dat := tok.ToDoubleArray()
-	assert.False(tmatch(dat, "bau"))
+	assert.True(tmatch(dat, "bau"))
 	assert.True(tmatch(dat, "bauamt"))
 	assert.True(tmatch(dat, "wahlamt"))
 	assert.True(tmatch(dat, "bauen"))
@@ -140,6 +140,10 @@
 }
 
 func TestIgnorableMCS(t *testing.T) {
+
+	// This test relies on final states and therefore
+	// no longer works correctly.
+
 	assert := assert.New(t)
 	// File has MCS in sigma but not in net
 	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
@@ -152,13 +156,13 @@
 	var tokens []string
 
 	// Is only unambiguous when transducing strictly greedy!
-	assert.True(dat.Transduce(strings.NewReader("ab<ab>"), w))
+	assert.True(dat.Transduce(strings.NewReader("ab<ab>a"), w))
 	tokens = strings.Split(w.String(), "\n")
-	assert.Equal("a\nb\n<ab>\n", w.String())
+	assert.Equal("a\nb\n<ab>a\n\n", w.String())
 	assert.Equal("a", tokens[0])
 	assert.Equal("b", tokens[1])
-	assert.Equal("<ab>", tokens[2])
-	assert.Equal(4, len(tokens))
+	assert.Equal("<ab>a", tokens[2])
+	assert.Equal(5, len(tokens))
 	assert.Equal(dat.TransCount(), 15)
 }
 
@@ -1018,3 +1022,13 @@
 //   BenchmarkToDoubleArray-4                   71919             16083 ns/op           10702 B/op         29 allocs/op
 //   BenchmarkToDoubleArrayLarger-4                16          68012819 ns/op         6357920 B/op       2578 allocs/op
 //   BenchmarkTransduceMatrix-4                 51529             23678 ns/op            8240 B/op          3 allocs/op
+// 2021-10-12 - Introduction of Callbacks in Matrix
+//   BenchmarkTransduce-4                       46947             26043 ns/op            8240 B/op          3 allocs/op
+//   BenchmarkToDoubleArray-4                   65192             16501 ns/op           10703 B/op         29 allocs/op
+//   BenchmarkToDoubleArrayLarger-4                15          69263576 ns/op         6357859 B/op       2577 allocs/op
+//   BenchmarkTransduceMatrix-4                 49928             26313 ns/op           12408 B/op          6 allocs/op
+// 2021-10-18 - Introduction of Callbacks in DA
+//   BenchmarkTransduce-4                       41055             30058 ns/op           12408 B/op          6 allocs/op
+//   BenchmarkToDoubleArray-4                   64672             17659 ns/op           10703 B/op         29 allocs/op
+//   BenchmarkToDoubleArrayLarger-4                15          71640553 ns/op         6357865 B/op       2577 allocs/op
+//   BenchmarkTransduceMatrix-4                 47036             26009 ns/op           12408 B/op          6 allocs/op
diff --git a/matrix.go b/matrix.go
index b800d1a..10680c3 100644
--- a/matrix.go
+++ b/matrix.go
@@ -313,6 +313,10 @@
 }
 
 func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
+	return mat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
+}
+
+func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
 	var a int
 	var t0 uint32
 	t := uint32(1) // Initial state
@@ -331,8 +335,7 @@
 	buffi := 0 // Buffer length
 
 	reader := bufio.NewReader(r)
-	writer := bufio.NewWriter(w)
-	defer writer.Flush()
+	defer w.Flush()
 
 	var char rune
 
@@ -411,7 +414,7 @@
 			fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
 		}
 
-		// Check if the transition is invalid according to the double array
+		// Check if the transition is invalid according to the matrix
 		if t == 0 {
 
 			if DEBUG {
@@ -465,20 +468,21 @@
 
 		} else {
 			// Transition marks the end of a token - so flush the buffer
-			if buffi > 0 {
+			if buffo > 0 {
 				if DEBUG {
 					fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
 				}
-				writer.WriteString(string(buffer[:buffo]))
+				w.Token(0, buffer[:buffo])
 				rewindBuffer = true
 				sentenceEnd = false
 			} else {
 				sentenceEnd = true
+				w.SentenceEnd()
 			}
 			if DEBUG {
 				fmt.Println("-> Newline")
 			}
-			writer.WriteRune('\n')
+			// writer.WriteRune('\n')
 		}
 
 		// Rewind the buffer if necessary
@@ -548,7 +552,9 @@
 	// sentence split was reached. This may be controversial and therefore
 	// optional via parameter.
 	if !sentenceEnd {
-		writer.WriteRune('\n')
+		// writer.WriteRune('\n')
+		// ::Sentenceend
+		w.SentenceEnd()
 		if DEBUG {
 			fmt.Println("-> Newline")
 		}
diff --git a/matrix_test.go b/matrix_test.go
index b3af1a7..cc45b8f 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -97,6 +97,32 @@
 	assert.True(tmatch(mat2, "wald gehen"))
 }
 
+func TestMatrixIgnorableMCS(t *testing.T) {
+	assert := assert.New(t)
+
+	// This test relies on final states and therefore
+	// no longer works correctly.
+
+	// File has MCS in sigma but not in net
+	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
+	assert.NotNil(tok)
+	mat := tok.ToMatrix()
+	assert.NotNil(mat)
+
+	b := make([]byte, 0, 2048)
+	w := bytes.NewBuffer(b)
+	var tokens []string
+
+	// Is only unambiguous when transducing strictly greedy!
+	assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
+	tokens = strings.Split(w.String(), "\n")
+	assert.Equal("a\nb\n<ab>a\n\n", w.String())
+	assert.Equal("a", tokens[0])
+	assert.Equal("b", tokens[1])
+	assert.Equal("<ab>a", tokens[2])
+	assert.Equal(5, len(tokens))
+}
+
 func TestReadWriteMatrixFullTokenizer(t *testing.T) {
 	assert := assert.New(t)
 	foma := LoadFomaFile("testdata/tokenizer.fst")
@@ -856,6 +882,20 @@
 	assert.Equal(datStr, matStr)
 }
 
+func TestFullTokenizerMatrixCallbackTransduce(t *testing.T) {
+	assert := assert.New(t)
+
+	mat := LoadMatrixFile("testdata/tokenizer.matok")
+
+	assert.NotNil(mat)
+
+	b := make([]byte, 0, 2048)
+	w := bytes.NewBuffer(b)
+	// var tokens []string
+
+	assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))
+}
+
 func BenchmarkTransduceMatrix(b *testing.B) {
 	bu := make([]byte, 0, 2048)
 	w := bytes.NewBuffer(bu)
diff --git a/token_writer.go b/token_writer.go
new file mode 100644
index 0000000..a10f112
--- /dev/null
+++ b/token_writer.go
@@ -0,0 +1,35 @@
+package datok
+
+import (
+	"bufio"
+	"io"
+)
+
+type TokenWriterI interface {
+	SentenceEnd()
+	Token(int, []rune)
+	Flush() error
+}
+
+var _ TokenWriterI = &TokenWriterSimple{}
+
+type TokenWriterSimple struct {
+	writer *bufio.Writer
+}
+
+func NewTokenWriterSimple(w io.Writer) *TokenWriterSimple {
+	return &TokenWriterSimple{bufio.NewWriter(w)}
+}
+
+func (tw *TokenWriterSimple) SentenceEnd() {
+	tw.writer.WriteRune('\n')
+}
+
+func (tw *TokenWriterSimple) Token(_ int, buf []rune) {
+	tw.writer.WriteString(string(buf))
+	tw.writer.WriteRune('\n')
+}
+
+func (tw *TokenWriterSimple) Flush() error {
+	return tw.writer.Flush()
+}
diff --git a/token_writer_test.go b/token_writer_test.go
new file mode 100644
index 0000000..9678157
--- /dev/null
+++ b/token_writer_test.go
@@ -0,0 +1,29 @@
+package datok
+
+import (
+	"bytes"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestTokenWriterSimple(t *testing.T) {
+	assert := assert.New(t)
+
+	b := make([]byte, 0, 2048)
+	w := bytes.NewBuffer(b)
+
+	tws := NewTokenWriterSimple(w)
+
+	assert.NotNil(tws)
+
+	tws.Token(0, []rune{'a', 'b', 'c'})
+
+	tws.Token(0, []rune{'d', 'e', 'f'})
+
+	tws.SentenceEnd()
+
+	tws.Flush()
+
+	assert.Equal("abc\ndef\n\n", w.String())
+}