Add xCheck() improvement proposed by Niu (2013)
diff --git a/datokenizer.go b/datokenizer.go
index ee9822c..a213b33 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -15,6 +15,10 @@
 
 // TODO:
 // - replace maxSize with the check value
+// - Check if final states can be optional.
+// - Introduce ELM (Morita et al. 2001) to speed
+//   up construction. Can be ignored on serialization
+//   to improve compression.
 // - Add checksum to serialization.
 // - Replace/Enhance table with a map
 // - Provide a bufio.Scanner compatible interface.
@@ -541,25 +545,25 @@
 	// Create a mapping from s (in Ms aka Intermediate FSA)
 	// to t (in Mt aka Double Array FSA)
 	table := make([]*mapping, tok.arcCount+1)
+	// tableQueue := make([]int, tok.arcCount+1)
 
 	// Initialize with the start state
 	table[size] = &mapping{source: 1, target: 1}
+	// tableQueue[size] = 1
 	size++
 
 	// Allocate space for the outgoing symbol range
 	A := make([]int, 0, tok.sigmaCount)
+	// tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
+	// tableLookup[1] = 1
 
-	// TODO:
-	//   Table lookup for the moment
-	//   only gives a minor performance benefit.
-	//   should be rewritten and should preplace the
-	//   table all together.
-	//   tableLookup := make(map[int]uint32)
-	//   tableLookup[1] = 1
+	// block_begin_pos := uint32(1)
 
 	for mark < size {
 		s = table[mark].source // This is a state in Ms
 		t = table[mark].target // This is a state in Mt
+		// s = tableQueue[mark]
+		// t = tableLookup[s]
 		mark++
 
 		// Following the paper, here the state t can be remembered
@@ -569,6 +573,7 @@
 
 		// Set base to the first free slot in the double array
 		base = dat.xCheck(A)
+		// base = dat.xCheckNiu(A, &block_begin_pos)
 		dat.array[t].setBase(base)
 
 		// TODO:
@@ -623,6 +628,7 @@
 				if r == 0 {
 					// Remember the mapping
 					table[size] = &mapping{source: s1, target: t1}
+					// tableQueue[size] = s1
 					// tableLookup[s1] = t1
 					size++
 				} else {
@@ -765,6 +771,34 @@
 	return base
 }
 
+// This is an implementation of xCheck wit an improvement
+// proposed by Niu et al. (2013)
+func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
+
+	// Start at the first entry of the double array list
+	base := uint32(1)
+
+	if len(symbols) > 3 {
+		sort.Ints(symbols)
+		if *block_begin_pos > uint32(symbols[0]) {
+			dat.resize(int(*block_begin_pos) + dat.final)
+			*block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
+			return *block_begin_pos - uint32(symbols[0])
+		}
+	}
+
+OVERLAP:
+	// Resize the array if necessary
+	dat.resize(int(base) + dat.final)
+	for _, a := range symbols {
+		if dat.array[int(base)+a].getCheck() != 0 {
+			base++
+			goto OVERLAP
+		}
+	}
+	return base
+}
+
 // List all outgoing transitions for a state
 // for testing purposes
 func (dat *DaTokenizer) outgoing(t uint32) []int {
diff --git a/datokenizer_test.go b/datokenizer_test.go
index 9fa44b5..c6b6c8e 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -941,6 +941,18 @@
 	}
 }
 
+func BenchmarkToDoubleArrayLarger(b *testing.B) {
+	tok := LoadFomaFile("testdata/abbr_bench.fst")
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		dat := tok.ToDoubleArray()
+		if dat == nil {
+			fmt.Println("Fail!")
+			os.Exit(1)
+		}
+	}
+}
+
 // 2021-08-11 (go 1.16)
 // go test -bench=. -test.benchmem
 //   BenchmarkTransduce-4         19069             60609 ns/op           11048 B/op        137 allocs/op
@@ -962,5 +974,12 @@
 //   BenchmarkTransduce-4               29376             34562 ns/op           15157 B/op          3 allocs/op
 //   BenchmarkToDoubleArray-4           54441             21355 ns/op           10704 B/op         29 allocs/op
 // 2021-09-02 - New tokenizer - fixed loading
-//   BenchmarkTransduce-4               44354             27466 ns/op            8240 B/op          3 allocs/op
-//   BenchmarkToDoubleArray-4           40719             25515 ns/op           10705 B/op         29 allocs/op
+//   BenchmarkTransduce-4                       40149             31515 ns/op            8240 B/op          3 allocs/op
+//   BenchmarkToDoubleArray-4                   51043             22586 ns/op           10702 B/op         29 allocs/op
+//   BenchmarkToDoubleArrayLarger-4                 3         396009639 ns/op         6352293 B/op       2575 allocs/op
+//   BenchmarkTransduce-4                       38698             31900 ns/op            8240 B/op          3 allocs/op
+//   BenchmarkToDoubleArray-4                   50644             21569 ns/op           11151 B/op         14 allocs/op
+//   BenchmarkToDoubleArrayLarger-4                 3         441260766 ns/op         6942336 B/op         30 allocs/op
+//   BenchmarkTransduce-4                       39966             30835 ns/op            8240 B/op          3 allocs/op
+//   BenchmarkToDoubleArray-4                   50720             24863 ns/op           11091 B/op         46 allocs/op
+//   BenchmarkToDoubleArrayLarger-4                 3         432523828 ns/op         6413381 B/op       5122 allocs/op
diff --git a/testdata/abbr_bench.fst b/testdata/abbr_bench.fst
new file mode 100644
index 0000000..97e3380
--- /dev/null
+++ b/testdata/abbr_bench.fst
Binary files differ