Combine Niu et al. (2013) and Morita et al. (2001)
diff --git a/datokenizer.go b/datokenizer.go
index 3e01396..5dfe0ca 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -573,9 +573,10 @@
tok.getSet(s, &A)
// Set base to the first free slot in the double array
- base = dat.xCheck(A)
+ // base = dat.xCheck(A)
// base = dat.xCheckSkip(A)
// base = dat.xCheckNiu(A, &block_begin_pos)
+ base = dat.xCheckSkipNiu(A)
dat.array[t].setBase(base)
// TODO:
@@ -651,9 +652,12 @@
// Following Mizobuchi et al (2000) the size of the
// FSA should be stored in check(1).
- // We make the size a bit smaller so we never have to check for boundaries.
- dat.setSize(dat.maxSize + 1)
- dat.array = dat.array[:dat.maxSize+1]
+ // We make the size a bit larger so we never have to check for boundaries.
+ dat.setSize(dat.maxSize + dat.final)
+ if len(dat.array) < dat.maxSize+dat.final {
+ dat.array = append(dat.array, make([]bc, dat.final)...)
+ }
+ dat.array = dat.array[:dat.maxSize+dat.final]
return dat
}
@@ -792,6 +796,31 @@
return base
}
+// This is an implementation of xCheck with the skip-improvement
+// proposed by Morita et al. (2001) for higher outdegrees as
+// proposed by Niu et al. (2013)
+func (dat *DaTokenizer) xCheckSkipNiu(symbols []int) uint32 {
+
+ // Start at the first entry of the double array list
+ base := uint32(1)
+
+ // Or skip the first few entries
+ if len(symbols) >= 3 {
+ base = uint32(math.Abs(float64(dat.maxSize-1)*.9)) + 1
+ }
+
+OVERLAP:
+ // Resize the array if necessary
+ dat.resize(int(base) + dat.final + 1)
+ for _, a := range symbols {
+ if dat.array[int(base)+a].getCheck() != 0 {
+ base++
+ goto OVERLAP
+ }
+ }
+ return base
+}
+
// This is an implementation of xCheck wit an improvement
// proposed by Niu et al. (2013)
func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
diff --git a/datokenizer_test.go b/datokenizer_test.go
index 62df4c7..6e219f1 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -115,13 +115,11 @@
assert.True(tmatch(dat, "bad"))
assert.True(tmatch(dat, "wald gehen"))
- assert.True(dat.LoadFactor() >= 75)
-
b := make([]byte, 0, 1024)
buf := bytes.NewBuffer(b)
n, err := dat.WriteTo(buf)
assert.Nil(err)
- assert.Equal(int64(208), n)
+ assert.Equal(int64(296), n)
dat2 := ParseDatok(buf)
assert.NotNil(dat2)
@@ -847,6 +845,13 @@
*/
}
+func TestLoadFactor1(t *testing.T) {
+ assert := assert.New(t)
+ tok := LoadFomaFile("testdata/abbr_bench.fst")
+ dat := tok.ToDoubleArray()
+ assert.True(dat.LoadFactor() > 88)
+}
+
func TestFullTokenizerXML(t *testing.T) {
assert := assert.New(t)
@@ -987,3 +992,7 @@
// BenchmarkTransduce-4 36325 38501 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 66858 19286 ns/op 10607 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 18 67428011 ns/op 6360604 B/op 2578 allocs/op
+// 2021-09-02 - xCheckSkipNiu() with .9 and >= 3
+// BenchmarkTransduce-4 37105 27714 ns/op 8240 B/op 3 allocs/op
+// BenchmarkToDoubleArray-4 76600 15973 ns/op 10703 B/op 29 allocs/op
+// BenchmarkToDoubleArrayLarger-4 21 55161934 ns/op 6357889 B/op 2578 allocs/op