Added benchmark for double array creation
diff --git a/datokenizer.go b/datokenizer.go
index 4f1e12d..cb3b822 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -99,7 +99,6 @@
maxSize int
loadFactor float64
array []bc
- // lastFilledBase uint32
// Special symbols in sigma
epsilon int
@@ -504,7 +503,6 @@
identity: tok.identity,
epsilon: tok.epsilon,
tokenend: tok.tokenend,
- // lastFilledBase: 1,
}
dat.resize(dat.final)
@@ -518,6 +516,7 @@
mark := 0
size := 0
+ var base uint32
// Create a mapping from s (in Ms aka Intermediate FSA)
// to t (in Mt aka Double Array FSA)
@@ -541,7 +540,8 @@
tok.getSet(s, &A)
// Set base to the first free slot in the double array
- dat.array[t].setBase(dat.xCheck(A))
+ base = dat.xCheck(A)
+ dat.array[t].setBase(base)
// TODO:
// Sort the outgoing transitions based on the
@@ -556,7 +556,7 @@
s1 := tok.transitions[s][a].end
// Store the transition
- t1 := dat.array[t].getBase() + uint32(a)
+ t1 := base + uint32(a)
dat.array[t1].setCheck(t)
// Set maxSize
@@ -600,10 +600,10 @@
}
} else {
// Store a final transition
- dat.array[dat.array[t].getBase()+uint32(dat.final)].setCheck(t)
+ dat.array[base+uint32(dat.final)].setCheck(t)
- if dat.maxSize < int(dat.array[t].getBase()+uint32(dat.final)) {
- dat.maxSize = int(dat.array[t].getBase() + uint32(dat.final))
+ if dat.maxSize < int(base+uint32(dat.final)) {
+ dat.maxSize = int(base + uint32(dat.final))
}
}
}
@@ -719,20 +719,9 @@
func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
// Start at the first entry of the double array list
- base := uint32(1) // dat.lastFilledBase
- // skip := false
+ base := uint32(1)
+
OVERLAP:
-
- /*
- if !skip {
- if dat.getCheck(base) != 0 {
- dat.lastFilledBase = base
- } else {
- skip = true
- }
- }
- */
-
// Resize the array if necessary
dat.resize(int(base) + dat.final)
for _, a := range symbols {
@@ -1115,7 +1104,9 @@
fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
}
- // TODO: Better not repeatedly check for a!
+ // TODO:
+ // Avoid looking up the symbol a repeatedly for the same char.
+ // Possibly cache a in a buffer.
if int(char) < 256 {
a = dat.sigmaASCII[int(char)]
} else {
diff --git a/datokenizer_test.go b/datokenizer_test.go
index d11abbe..73282d2 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -848,7 +848,8 @@
}
}
-func BenchmarkLoadDatokFile(b *testing.B) {
+// This benchmark is disabled (the X prefix hides it from `go test -bench`) because the datok file changes over time.
+func XBenchmarkLoadDatokFile(b *testing.B) {
for i := 0; i < b.N; i++ {
dat := LoadDatokFile("testdata/tokenizer.datok")
if dat == nil {
@@ -858,6 +859,17 @@
}
}
+// BenchmarkToDoubleArray times only ToDoubleArray; loading the fixture is untimed setup.
+func BenchmarkToDoubleArray(b *testing.B) {
+	tok := LoadFomaFile("testdata/simple_bench.fst")
+	b.ResetTimer() // exclude fixture loading from the measurement
+	for i := 0; i < b.N; i++ {
+		if tok.ToDoubleArray() == nil {
+			b.Fatal("Fail!") // fail through the testing framework instead of os.Exit
+		}
+	}
+}
+
// 2021-08-11 (go 1.16)
// go test -bench=. -test.benchmem
// BenchmarkTransduce-4 19069 60609 ns/op 11048 B/op 137 allocs/op
@@ -873,3 +885,6 @@
// BenchmarkLoadDatokFile-4 7 143143934 ns/op 203158450 B/op 5743 allocs/op
// BenchmarkTransduce-4 34939 34363 ns/op 14056 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 7 149511609 ns/op 203217193 B/op 5915 allocs/op
+// 2021-08-17
+// BenchmarkTransduce-4 31204 32678 ns/op 14752 B/op 3 allocs/op
+// BenchmarkToDoubleArray-4 44138 26850 ns/op 10704 B/op 29 allocs/op
diff --git a/testdata/simple_bench.fst b/testdata/simple_bench.fst
new file mode 100644
index 0000000..d399284
--- /dev/null
+++ b/testdata/simple_bench.fst
Binary files differ