Add xCheck() improvement proposed by Niu (2013)
diff --git a/datokenizer.go b/datokenizer.go
index ee9822c..a213b33 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -15,6 +15,10 @@
// TODO:
// - replace maxSize with the check value
+// - Check if final states can be optional.
+// - Introduce ELM (Morita et al. 2001) to speed
+// up construction. Can be ignored on serialization
+// to improve compression.
// - Add checksum to serialization.
// - Replace/Enhance table with a map
// - Provide a bufio.Scanner compatible interface.
@@ -541,25 +545,25 @@
// Create a mapping from s (in Ms aka Intermediate FSA)
// to t (in Mt aka Double Array FSA)
table := make([]*mapping, tok.arcCount+1)
+ // tableQueue := make([]int, tok.arcCount+1)
// Initialize with the start state
table[size] = &mapping{source: 1, target: 1}
+ // tableQueue[size] = 1
size++
// Allocate space for the outgoing symbol range
A := make([]int, 0, tok.sigmaCount)
+ // tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
+ // tableLookup[1] = 1
- // TODO:
- // Table lookup for the moment
- // only gives a minor performance benefit.
- // should be rewritten and should preplace the
- // table all together.
- // tableLookup := make(map[int]uint32)
- // tableLookup[1] = 1
+ // block_begin_pos := uint32(1)
for mark < size {
s = table[mark].source // This is a state in Ms
t = table[mark].target // This is a state in Mt
+ // s = tableQueue[mark]
+ // t = tableLookup[s]
mark++
// Following the paper, here the state t can be remembered
@@ -569,6 +573,7 @@
// Set base to the first free slot in the double array
base = dat.xCheck(A)
+ // base = dat.xCheckNiu(A, &block_begin_pos)
dat.array[t].setBase(base)
// TODO:
@@ -623,6 +628,7 @@
if r == 0 {
// Remember the mapping
table[size] = &mapping{source: s1, target: t1}
+ // tableQueue[size] = s1
// tableLookup[s1] = t1
size++
} else {
@@ -765,6 +771,34 @@
return base
}
+// This is an implementation of xCheck wit an improvement
+// proposed by Niu et al. (2013)
+func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
+
+ // Start at the first entry of the double array list
+ base := uint32(1)
+
+ if len(symbols) > 3 {
+ sort.Ints(symbols)
+ if *block_begin_pos > uint32(symbols[0]) {
+ dat.resize(int(*block_begin_pos) + dat.final)
+ *block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
+ return *block_begin_pos - uint32(symbols[0])
+ }
+ }
+
+OVERLAP:
+ // Resize the array if necessary
+ dat.resize(int(base) + dat.final)
+ for _, a := range symbols {
+ if dat.array[int(base)+a].getCheck() != 0 {
+ base++
+ goto OVERLAP
+ }
+ }
+ return base
+}
+
// List all outgoing transitions for a state
// for testing purposes
func (dat *DaTokenizer) outgoing(t uint32) []int {
diff --git a/datokenizer_test.go b/datokenizer_test.go
index 9fa44b5..c6b6c8e 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -941,6 +941,18 @@
}
}
+func BenchmarkToDoubleArrayLarger(b *testing.B) {
+ tok := LoadFomaFile("testdata/abbr_bench.fst")
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ dat := tok.ToDoubleArray()
+ if dat == nil {
+ fmt.Println("Fail!")
+ os.Exit(1)
+ }
+ }
+}
+
// 2021-08-11 (go 1.16)
// go test -bench=. -test.benchmem
// BenchmarkTransduce-4 19069 60609 ns/op 11048 B/op 137 allocs/op
@@ -962,5 +974,12 @@
// BenchmarkTransduce-4 29376 34562 ns/op 15157 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 54441 21355 ns/op 10704 B/op 29 allocs/op
// 2021-09-02 - New tokenizer - fixed loading
-// BenchmarkTransduce-4 44354 27466 ns/op 8240 B/op 3 allocs/op
-// BenchmarkToDoubleArray-4 40719 25515 ns/op 10705 B/op 29 allocs/op
+// BenchmarkTransduce-4 40149 31515 ns/op 8240 B/op 3 allocs/op
+// BenchmarkToDoubleArray-4 51043 22586 ns/op 10702 B/op 29 allocs/op
+// BenchmarkToDoubleArrayLarger-4 3 396009639 ns/op 6352293 B/op 2575 allocs/op
+// BenchmarkTransduce-4 38698 31900 ns/op 8240 B/op 3 allocs/op
+// BenchmarkToDoubleArray-4 50644 21569 ns/op 11151 B/op 14 allocs/op
+// BenchmarkToDoubleArrayLarger-4 3 441260766 ns/op 6942336 B/op 30 allocs/op
+// BenchmarkTransduce-4 39966 30835 ns/op 8240 B/op 3 allocs/op
+// BenchmarkToDoubleArray-4 50720 24863 ns/op 11091 B/op 46 allocs/op
+// BenchmarkToDoubleArrayLarger-4 3 432523828 ns/op 6413381 B/op 5122 allocs/op
diff --git a/testdata/abbr_bench.fst b/testdata/abbr_bench.fst
new file mode 100644
index 0000000..97e3380
--- /dev/null
+++ b/testdata/abbr_bench.fst
Binary files differ