Combine Niu et al. (2013) and Morita et al. (2001)

commit: 29e306ffff091284836f429eff5fccd634d3f3f1 [log] [tgz]
author: Akron <nils@diewald-online.de> Thu Sep 02 18:29:56 2021 +0200
committer: Akron <nils@diewald-online.de> Thu Sep 02 18:29:56 2021 +0200
tree: 7010c6d3db5bcba84f4bb28412c37d1950ff406d
parent: 679b486279cc0947917d0cfeaf655fe4be15f17e [diff]
diff --git a/datokenizer.go b/datokenizer.go
index 3e01396..5dfe0ca 100644
--- a/datokenizer.go
+++ b/datokenizer.go

@@ -573,9 +573,10 @@
 		tok.getSet(s, &A)
 
 		// Set base to the first free slot in the double array
-		base = dat.xCheck(A)
+		// base = dat.xCheck(A)
 		// base = dat.xCheckSkip(A)
 		// base = dat.xCheckNiu(A, &block_begin_pos)
+		base = dat.xCheckSkipNiu(A)
 		dat.array[t].setBase(base)
 
 		// TODO:
@@ -651,9 +652,12 @@
 
 	// Following Mizobuchi et al (2000) the size of the
 	// FSA should be stored in check(1).
-	// We make the size a bit smaller so we never have to check for boundaries.
-	dat.setSize(dat.maxSize + 1)
-	dat.array = dat.array[:dat.maxSize+1]
+	// We make the size a bit larger so we never have to check for boundaries.
+	dat.setSize(dat.maxSize + dat.final)
+	if len(dat.array) < dat.maxSize+dat.final {
+		dat.array = append(dat.array, make([]bc, dat.final)...)
+	}
+	dat.array = dat.array[:dat.maxSize+dat.final]
 	return dat
 }
 
@@ -792,6 +796,31 @@
 	return base
 }
 
+// xCheckSkipNiu returns the lowest base, counted from its start position, at which
+// every slot base+a (a in symbols) has a zero check value. It combines the skip
+// heuristic of Morita et al. (2001) with the outdegree test of Niu et al. (2013).
+func (dat *DaTokenizer) xCheckSkipNiu(symbols []int) uint32 {
+
+	// By default the search starts at index 1 of the double array
+	base := uint32(1)
+
+	// With three or more symbols, start the search at roughly 90% of maxSize instead
+	if len(symbols) >= 3 {
+		base = uint32(math.Abs(float64(dat.maxSize-1)*.9)) + 1 // Abs keeps the value non-negative before the uint32 conversion
+	}
+
+OVERLAP:
+	// Called again for every candidate base; presumably grows the array so the probes below stay in range (assumes each symbol <= dat.final — TODO confirm)
+	dat.resize(int(base) + dat.final + 1)
+	for _, a := range symbols {
+		if dat.array[int(base)+a].getCheck() != 0 { // non-zero check: slot is treated as occupied
+			base++ // try the next base
+			goto OVERLAP // and re-test all symbols from the start
+		}
+	}
+	return base
+}
+
 // This is an implementation of xCheck wit an improvement
 // proposed by Niu et al. (2013)
 func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {

diff --git a/datokenizer_test.go b/datokenizer_test.go
index 62df4c7..6e219f1 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go

@@ -115,13 +115,11 @@
 	assert.True(tmatch(dat, "bad"))
 	assert.True(tmatch(dat, "wald gehen"))
 
-	assert.True(dat.LoadFactor() >= 75)
-
 	b := make([]byte, 0, 1024)
 	buf := bytes.NewBuffer(b)
 	n, err := dat.WriteTo(buf)
 	assert.Nil(err)
-	assert.Equal(int64(208), n)
+	assert.Equal(int64(296), n)
 
 	dat2 := ParseDatok(buf)
 	assert.NotNil(dat2)
@@ -847,6 +845,13 @@
 	*/
 }
 
+func TestLoadFactor1(t *testing.T) {
+	assert := assert.New(t)
+	tok := LoadFomaFile("testdata/abbr_bench.fst")
+	dat := tok.ToDoubleArray()
+	assert.True(dat.LoadFactor() > 88) // lower bound on the load factor of the array built from abbr_bench.fst; presumably a percentage — TODO confirm
+}
+
 func TestFullTokenizerXML(t *testing.T) {
 	assert := assert.New(t)
 
@@ -987,3 +992,7 @@
 //   BenchmarkTransduce-4                       36325             38501 ns/op            8240 B/op          3 allocs/op
 //   BenchmarkToDoubleArray-4                   66858             19286 ns/op           10607 B/op         29 allocs/op
 //   BenchmarkToDoubleArrayLarger-4                18          67428011 ns/op         6360604 B/op       2578 allocs/op
+// 2021-09-02 - xCheckSkipNiu() with .9 and >= 3
+//   BenchmarkTransduce-4                       37105             27714 ns/op            8240 B/op          3 allocs/op
+//   BenchmarkToDoubleArray-4                   76600             15973 ns/op           10703 B/op         29 allocs/op
+//   BenchmarkToDoubleArrayLarger-4                21          55161934 ns/op         6357889 B/op       2578 allocs/op
commit	29e306ffff091284836f429eff5fccd634d3f3f1	[log] [tgz]
author	Akron <nils@diewald-online.de>	Thu Sep 02 18:29:56 2021 +0200
committer	Akron <nils@diewald-online.de>	Thu Sep 02 18:29:56 2021 +0200
tree	7010c6d3db5bcba84f4bb28412c37d1950ff406d
parent	679b486279cc0947917d0cfeaf655fe4be15f17e [diff]