Add skip-method proposed by Morita et al. (2001)
diff --git a/datokenizer.go b/datokenizer.go
index a213b33..3e01396 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -30,6 +30,7 @@
"encoding/binary"
"fmt"
"io"
+ "math"
"os"
"sort"
"strconv"
@@ -573,6 +574,7 @@
// Set base to the first free slot in the double array
base = dat.xCheck(A)
+ // base = dat.xCheckSkip(A)
// base = dat.xCheckNiu(A, &block_begin_pos)
dat.array[t].setBase(base)
@@ -771,6 +773,25 @@
return base
}
+// This is an implementation of xCheck with the skip-improvement
+// proposed by Morita et al. (2001)
+func (dat *DaTokenizer) xCheckSkip(symbols []int) uint32 {
+
+ // Start at the first entry of the double array list
+ base := uint32(math.Abs(float64(dat.maxSize-1) * .9))
+
+OVERLAP:
+ // Resize the array if necessary
+ dat.resize(int(base) + dat.final)
+ for _, a := range symbols {
+ if dat.array[int(base)+a].getCheck() != 0 {
+ base++
+ goto OVERLAP
+ }
+ }
+ return base
+}
+
// This is an implementation of xCheck wit an improvement
// proposed by Niu et al. (2013)
func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
diff --git a/datokenizer_test.go b/datokenizer_test.go
index c6b6c8e..62df4c7 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -983,3 +983,7 @@
// BenchmarkTransduce-4 39966 30835 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 50720 24863 ns/op 11091 B/op 46 allocs/op
// BenchmarkToDoubleArrayLarger-4 3 432523828 ns/op 6413381 B/op 5122 allocs/op
+// 2021-09-02 - xCheckSkip() with .9
+// BenchmarkTransduce-4 36325 38501 ns/op 8240 B/op 3 allocs/op
+// BenchmarkToDoubleArray-4 66858 19286 ns/op 10607 B/op 29 allocs/op
+// BenchmarkToDoubleArrayLarger-4 18 67428011 ns/op 6360604 B/op 2578 allocs/op