Sort alphabet prior to xCheck
diff --git a/datokenizer.go b/datokenizer.go
index e128d49..b6e877b 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -18,6 +18,7 @@
"fmt"
"io"
"os"
+ "sort"
"strconv"
"strings"
"unicode/utf8"
@@ -532,6 +533,9 @@
for a := range tok.transitions[s] {
*A = append(*A, a)
}
+
+ // Sorting is not required, but it simplifies bug hunting
+ sort.Ints(*A)
}
// Based on Mizobuchi et al (2000), p. 124
diff --git a/datokenizer_test.go b/datokenizer_test.go
index 4ba42b3..8cabb67 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -1,6 +1,7 @@
package datokenizer
import (
+ "fmt"
"testing"
"github.com/stretchr/testify/assert"
@@ -40,13 +41,12 @@
assert.True(tok.Match("wald gehen"))
}
-/*
func TestFullTokenizer(t *testing.T) {
assert := assert.New(t)
tok := ParseFile("testdata/tokenizer.fst")
tok.ToDoubleArray()
+ fmt.Println("Size:", tok.maxSize)
assert.True(tok.Match("bau"))
assert.True(tok.Match("bad"))
assert.True(tok.Match("wald gehen"))
}
-*/
diff --git a/testdata/tokenizer.fst b/testdata/tokenizer.fst
new file mode 100644
index 0000000..ba46207
--- /dev/null
+++ b/testdata/tokenizer.fst
Binary files differ