Introduce token_writer object
This change also removes final state
sensitivity from the tokenizer. Tokens
now require a tokenend transition to
be treated as complete.
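
Writers implementing the new TokenWriterI interface can
consume tokens and sentence boundaries directly, instead of
receiving newline-delimited text. A minimal sketch of a custom
writer (CountWriter is hypothetical and not part of this
change; only TokenWriterI and TransduceTokenWriter are):

    // Count tokens and sentences instead of writing text.
    type CountWriter struct {
        Tokens    int
        Sentences int
    }

    func (cw *CountWriter) Token(_ int, _ []rune) { cw.Tokens++ }
    func (cw *CountWriter) SentenceEnd()          { cw.Sentences++ }
    func (cw *CountWriter) Flush() error          { return nil }

    // Usage, assuming dat is an initialized *DaTokenizer:
    //   cw := &CountWriter{}
    //   dat.TransduceTokenWriter(strings.NewReader("Der Baum."), cw)
    //   fmt.Println(cw.Tokens, cw.Sentences)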
diff --git a/Readme.md b/Readme.md
index a7930a2..956bcf6 100644
--- a/Readme.md
+++ b/Readme.md
@@ -23,6 +23,8 @@
the `@_TOKEN_SYMBOL_@`.
- Two consecutive `@_TOKEN_SYMBOL_@`s mark a sentence end.
- Flag diacritics are not supported.
+- Final states are ignored. The `@_TOKEN_SYMBOL_@` marks
+ the end of a token instead.
A minimal usable tokenizer written in XFST and following
the guidelines to tokenizers in Beesley and Karttunen (2003)
diff --git a/datok.go b/datok.go
index fa6e44d..b70586c 100644
--- a/datok.go
+++ b/datok.go
@@ -724,14 +724,19 @@
return string(out)
}
-// Transduce an input string against the double array
-// FSA. The rules are always greedy. If the automaton fails,
-// it takes the last possible token ending branch.
+func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
+ return dat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
+}
+
+// TransduceTokenWriter transduces an input string against
+// the double array FSA. The rules are always greedy. If the
+// automaton fails, it takes the last possible token ending
+// branch.
//
// Based on Mizobuchi et al (2000), p. 129,
// with additional support for IDENTITY, UNKNOWN
// and EPSILON transitions and NONTOKEN and TOKENEND handling.
-func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
+func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
var a int
var t0 uint32
t := uint32(1) // Initial state
@@ -742,6 +747,9 @@
epsilonState := uint32(0)
epsilonOffset := 0
+ // Remember if a sentence end was already written
+ sentenceEnd := false
+
// Implement a low level buffer for full control,
// however - it is probably better to introduce
// this on a higher level with a io.Reader interface
@@ -761,10 +769,10 @@
buffi := 0 // Buffer length
reader := bufio.NewReader(r)
- writer := bufio.NewWriter(w)
- defer writer.Flush()
+ defer w.Flush()
var char rune
+
var err error
eof := false
newchar := true
@@ -819,6 +827,10 @@
// Remember state for backtracking to last tokenend state
epsilonState = t0
epsilonOffset = buffo
+
+ if DEBUG {
+ fmt.Println("epsilonOffset is set to", buffo)
+ }
}
}
@@ -885,27 +897,33 @@
}
rewindBuffer = true
}
- }
- // Transition marks the end of a token - so flush the buffer
- if ta.isTokenEnd() {
+ } else {
- if buffi > 0 {
+ // Transition marks the end of a token - so flush the buffer
+ if buffo > 0 {
if DEBUG {
fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
}
- writer.WriteString(string(buffer[:buffo]))
+ w.Token(0, buffer[:buffo])
rewindBuffer = true
+ sentenceEnd = false
+ } else {
+ sentenceEnd = true
+ w.SentenceEnd()
}
if DEBUG {
fmt.Println("-> Newline")
}
- writer.WriteRune('\n')
}
// Rewind the buffer if necessary
if rewindBuffer {
+ if DEBUG {
+ fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
+ }
+
// TODO: Better as a ring buffer
for x, i := range buffer[buffo:buffi] {
buffer[x] = i
@@ -913,7 +931,9 @@
buffi -= buffo
// epsilonOffset -= buffo
- epsilonOffset = buffo
+ epsilonOffset = 0
+ epsilonState = 0
+
buffo = 0
if DEBUG {
fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
@@ -948,46 +968,43 @@
fmt.Println("Entering final check")
}
- // Automaton is in a final state, so flush the buffer and return
- x := dat.array[t].getBase() + uint32(dat.final)
+ /*
+ The following code is for deprecated automata relying on
+ final states. Datok now requires final states to be marked
+ with tokenends.
- if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
+ // Automaton is in a final state, so flush the buffer and return
+ x := dat.array[t].getBase() + uint32(dat.final)
- if buffi > 0 {
- if DEBUG {
- fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
- }
- writer.WriteString(string(buffer[:buffi]))
+ if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
- if dat.array[t].isTokenEnd() {
- writer.WriteRune('\n')
- if DEBUG {
- fmt.Println("-> Newline")
+ if buffi > 0 {
+ if DEBUG {
+ fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
+ }
+ w.Token(0, buffer[:buffi])
}
- }
- }
- // Add an additional sentence ending, if the file is over but no explicit
- // sentence split was reached. This may be controversial and therefore
- // optional via parameter.
- if !dat.array[t0].isTokenEnd() {
- writer.WriteRune('\n')
- if DEBUG {
- fmt.Println("-> Newline")
- }
- }
+ // Add an additional sentence ending, if the file is over but no explicit
+ // sentence split was reached. This may be controversial and therefore
+ // optional via parameter.
+ if !dat.array[t0].isTokenEnd() {
+ w.SentenceEnd()
+ }
- // TODO:
- // There may be a new line at the end, from an epsilon,
- // so we may need to go on!
- return true
- }
+ // TODO:
+ // There may be a new line at the end, from an epsilon,
+ // so we may need to go on!
+ return true
+ }
+ */
// Check epsilon transitions until a final state is reached
t0 = t
t = dat.array[t0].getBase() + uint32(dat.epsilon)
a = dat.epsilon
newchar = false
+
if dat.array[t].getCheck() == t0 {
// Remember state for backtracking to last tokenend state
goto PARSECHAR
@@ -1001,5 +1018,18 @@
}
goto PARSECHAR
}
- return false
+
+ // Add an additional sentence ending, if the file is over but no explicit
+ // sentence split was reached. This may be controversial and therefore
+ // optional via parameter.
+ if !sentenceEnd {
+ // writer.WriteRune('\n')
+ // ::Sentenceend
+ w.SentenceEnd()
+ if DEBUG {
+ fmt.Println("-> Newline")
+ }
+ }
+
+ return true
}
diff --git a/datok_test.go b/datok_test.go
index d1c0165..1143eb1 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -38,7 +38,7 @@
assert.True(tmatch(dat, "bau"))
assert.True(tmatch(dat, "bauamt"))
assert.False(tmatch(dat, "baum"))
- assert.False(tmatch(dat, "baua"))
+ assert.True(tmatch(dat, "baua"))
}
func TestSimpleBranches(t *testing.T) {
@@ -47,7 +47,7 @@
// (bau | wahl) (amt | en)
tok := LoadFomaFile("testdata/wahlamt.fst")
dat := tok.ToDoubleArray()
- assert.False(tmatch(dat, "bau"))
+ assert.True(tmatch(dat, "bau"))
assert.True(tmatch(dat, "bauamt"))
assert.True(tmatch(dat, "wahlamt"))
assert.True(tmatch(dat, "bauen"))
@@ -140,6 +140,10 @@
}
func TestIgnorableMCS(t *testing.T) {
+
+ // This test relies on final states. That's why it is
+ // not working correctly anymore.
+
assert := assert.New(t)
// File has MCS in sigma but not in net
tok := LoadFomaFile("testdata/ignorable_mcs.fst")
@@ -152,13 +156,13 @@
var tokens []string
// Is only unambiguous when transducing strictly greedy!
- assert.True(dat.Transduce(strings.NewReader("ab<ab>"), w))
+ assert.True(dat.Transduce(strings.NewReader("ab<ab>a"), w))
tokens = strings.Split(w.String(), "\n")
- assert.Equal("a\nb\n<ab>\n", w.String())
+ assert.Equal("a\nb\n<ab>a\n\n", w.String())
assert.Equal("a", tokens[0])
assert.Equal("b", tokens[1])
- assert.Equal("<ab>", tokens[2])
- assert.Equal(4, len(tokens))
+ assert.Equal("<ab>a", tokens[2])
+ assert.Equal(5, len(tokens))
assert.Equal(dat.TransCount(), 15)
}
@@ -1018,3 +1022,13 @@
// BenchmarkToDoubleArray-4 71919 16083 ns/op 10702 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 16 68012819 ns/op 6357920 B/op 2578 allocs/op
// BenchmarkTransduceMatrix-4 51529 23678 ns/op 8240 B/op 3 allocs/op
+// 2021-10-12 - Introduction of Callbacks in Matrix
+// BenchmarkTransduce-4 46947 26043 ns/op 8240 B/op 3 allocs/op
+// BenchmarkToDoubleArray-4 65192 16501 ns/op 10703 B/op 29 allocs/op
+// BenchmarkToDoubleArrayLarger-4 15 69263576 ns/op 6357859 B/op 2577 allocs/op
+// BenchmarkTransduceMatrix-4 49928 26313 ns/op 12408 B/op 6 allocs/op
+// 2021-10-18 - Introduction of Callbacks in DA
+// BenchmarkTransduce-4 41055 30058 ns/op 12408 B/op 6 allocs/op
+// BenchmarkToDoubleArray-4 64672 17659 ns/op 10703 B/op 29 allocs/op
+// BenchmarkToDoubleArrayLarger-4 15 71640553 ns/op 6357865 B/op 2577 allocs/op
+// BenchmarkTransduceMatrix-4 47036 26009 ns/op 12408 B/op 6 allocs/op
diff --git a/matrix.go b/matrix.go
index b800d1a..10680c3 100644
--- a/matrix.go
+++ b/matrix.go
@@ -313,6 +313,10 @@
}
func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
+ return mat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
+}
+
+func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
var a int
var t0 uint32
t := uint32(1) // Initial state
@@ -331,8 +335,7 @@
buffi := 0 // Buffer length
reader := bufio.NewReader(r)
- writer := bufio.NewWriter(w)
- defer writer.Flush()
+ defer w.Flush()
var char rune
@@ -411,7 +414,7 @@
fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
}
- // Check if the transition is invalid according to the double array
+ // Check if the transition is invalid according to the matrix
if t == 0 {
if DEBUG {
@@ -465,20 +468,21 @@
} else {
// Transition marks the end of a token - so flush the buffer
- if buffi > 0 {
+ if buffo > 0 {
if DEBUG {
fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
}
- writer.WriteString(string(buffer[:buffo]))
+ w.Token(0, buffer[:buffo])
rewindBuffer = true
sentenceEnd = false
} else {
sentenceEnd = true
+ w.SentenceEnd()
}
if DEBUG {
fmt.Println("-> Newline")
}
- writer.WriteRune('\n')
+ // writer.WriteRune('\n')
}
// Rewind the buffer if necessary
@@ -548,7 +552,9 @@
// sentence split was reached. This may be controversial and therefore
// optional via parameter.
if !sentenceEnd {
- writer.WriteRune('\n')
+ // writer.WriteRune('\n')
+ // ::Sentenceend
+ w.SentenceEnd()
if DEBUG {
fmt.Println("-> Newline")
}
diff --git a/matrix_test.go b/matrix_test.go
index b3af1a7..cc45b8f 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -97,6 +97,32 @@
assert.True(tmatch(mat2, "wald gehen"))
}
+func TestMatrixIgnorableMCS(t *testing.T) {
+ assert := assert.New(t)
+
+ // This test relies on final states. That's why it is
+ // not working correctly anymore.
+
+ // File has MCS in sigma but not in net
+ tok := LoadFomaFile("testdata/ignorable_mcs.fst")
+ assert.NotNil(tok)
+ mat := tok.ToMatrix()
+ assert.NotNil(mat)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var tokens []string
+
+ // Is only unambiguous when transducing strictly greedy!
+ assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
+ tokens = strings.Split(w.String(), "\n")
+ assert.Equal("a\nb\n<ab>a\n\n", w.String())
+ assert.Equal("a", tokens[0])
+ assert.Equal("b", tokens[1])
+ assert.Equal("<ab>a", tokens[2])
+ assert.Equal(5, len(tokens))
+}
+
func TestReadWriteMatrixFullTokenizer(t *testing.T) {
assert := assert.New(t)
foma := LoadFomaFile("testdata/tokenizer.fst")
@@ -856,6 +882,20 @@
assert.Equal(datStr, matStr)
}
+func TestFullTokenizerMatrixCallbackTransduce(t *testing.T) {
+ assert := assert.New(t)
+
+ mat := LoadMatrixFile("testdata/tokenizer.matok")
+
+ assert.NotNil(mat)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ // var tokens []string
+
+ assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))
+}
+
func BenchmarkTransduceMatrix(b *testing.B) {
bu := make([]byte, 0, 2048)
w := bytes.NewBuffer(bu)
diff --git a/token_writer.go b/token_writer.go
new file mode 100644
index 0000000..a10f112
--- /dev/null
+++ b/token_writer.go
@@ -0,0 +1,35 @@
+package datok
+
+import (
+ "bufio"
+ "io"
+)
+
+type TokenWriterI interface {
+ SentenceEnd()
+ Token(int, []rune)
+ Flush() error
+}
+
+var _ TokenWriterI = &TokenWriterSimple{}
+
+type TokenWriterSimple struct {
+ writer *bufio.Writer
+}
+
+func NewTokenWriterSimple(w io.Writer) *TokenWriterSimple {
+ return &TokenWriterSimple{bufio.NewWriter(w)}
+}
+
+func (tw *TokenWriterSimple) SentenceEnd() {
+ tw.writer.WriteRune('\n')
+}
+
+func (tw *TokenWriterSimple) Token(_ int, buf []rune) {
+ tw.writer.WriteString(string(buf))
+ tw.writer.WriteRune('\n')
+}
+
+func (tw *TokenWriterSimple) Flush() error {
+ return tw.writer.Flush()
+}
diff --git a/token_writer_test.go b/token_writer_test.go
new file mode 100644
index 0000000..9678157
--- /dev/null
+++ b/token_writer_test.go
@@ -0,0 +1,29 @@
+package datok
+
+import (
+ "bytes"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func TestTokenWriterSimple(t *testing.T) {
+ assert := assert.New(t)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+
+ tws := NewTokenWriterSimple(w)
+
+ assert.NotNil(tws)
+
+ tws.Token(0, []rune{'a', 'b', 'c'})
+
+ tws.Token(0, []rune{'d', 'e', 'f'})
+
+ tws.SentenceEnd()
+
+ tws.Flush()
+
+ assert.Equal("abc\ndef\n\n", w.String())
+}