Added benchmark for double array creation

commit: 6f1c16c0b5ad979ded8d35a93319b9ce908e312b [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Aug 17 10:45:42 2021 +0200
committer: Akron <nils@diewald-online.de> Tue Aug 17 10:45:42 2021 +0200
tree: 9f0fe1395a8b687686c1a77c0fe612de35de5e15
parent: 3de361e2df232b7770db6175de50f5eca314e33b [diff]
diff --git a/datokenizer.go b/datokenizer.go
index 4f1e12d..cb3b822 100644
--- a/datokenizer.go
+++ b/datokenizer.go

@@ -99,7 +99,6 @@
 	maxSize    int
 	loadFactor float64
 	array      []bc
-	// lastFilledBase uint32
 
 	// Special symbols in sigma
 	epsilon  int
@@ -504,7 +503,6 @@
 		identity:   tok.identity,
 		epsilon:    tok.epsilon,
 		tokenend:   tok.tokenend,
-		// lastFilledBase: 1,
 	}
 
 	dat.resize(dat.final)
@@ -518,6 +516,7 @@
 
 	mark := 0
 	size := 0
+	var base uint32
 
 	// Create a mapping from s (in Ms aka Intermediate FSA)
 	// to t (in Mt aka Double Array FSA)
@@ -541,7 +540,8 @@
 		tok.getSet(s, &A)
 
 		// Set base to the first free slot in the double array
-		dat.array[t].setBase(dat.xCheck(A))
+		base = dat.xCheck(A)
+		dat.array[t].setBase(base)
 
 		// TODO:
 		//   Sort the outgoing transitions based on the
@@ -556,7 +556,7 @@
 				s1 := tok.transitions[s][a].end
 
 				// Store the transition
-				t1 := dat.array[t].getBase() + uint32(a)
+				t1 := base + uint32(a)
 				dat.array[t1].setCheck(t)
 
 				// Set maxSize
@@ -600,10 +600,10 @@
 				}
 			} else {
 				// Store a final transition
-				dat.array[dat.array[t].getBase()+uint32(dat.final)].setCheck(t)
+				dat.array[base+uint32(dat.final)].setCheck(t)
 
-				if dat.maxSize < int(dat.array[t].getBase()+uint32(dat.final)) {
-					dat.maxSize = int(dat.array[t].getBase() + uint32(dat.final))
+				if dat.maxSize < int(base+uint32(dat.final)) {
+					dat.maxSize = int(base + uint32(dat.final))
 				}
 			}
 		}
@@ -719,20 +719,9 @@
 func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
 
 	// Start at the first entry of the double array list
-	base := uint32(1) // dat.lastFilledBase
-	// skip := false
+	base := uint32(1)
+
 OVERLAP:
-
-	/*
-		if !skip {
-			if dat.getCheck(base) != 0 {
-				dat.lastFilledBase = base
-			} else {
-				skip = true
-			}
-		}
-	*/
-
 	// Resize the array if necessary
 	dat.resize(int(base) + dat.final)
 	for _, a := range symbols {
@@ -1115,7 +1104,9 @@
 				fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
 			}
 
-			// TODO: Better not repeatedly check for a!
+			// TODO:
+			//   Better not to repeatedly look up a from char!
+			//   Possibly keep a buffer with a.
 			if int(char) < 256 {
 				a = dat.sigmaASCII[int(char)]
 			} else {

diff --git a/datokenizer_test.go b/datokenizer_test.go
index d11abbe..73282d2 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go

@@ -848,7 +848,8 @@
 	}
 }
 
-func BenchmarkLoadDatokFile(b *testing.B) {
+// This benchmark is disabled (renamed with an X prefix) as the datok file changes over time
+func XBenchmarkLoadDatokFile(b *testing.B) {
 	for i := 0; i < b.N; i++ {
 		dat := LoadDatokFile("testdata/tokenizer.datok")
 		if dat == nil {
@@ -858,6 +859,17 @@
 	}
 }
 
+func BenchmarkToDoubleArray(b *testing.B) {
+	tok := LoadFomaFile("testdata/simple_bench.fst")
+	for i := 0; i < b.N; i++ {
+		dat := tok.ToDoubleArray()
+		if dat == nil {
+			fmt.Println("Fail!")
+			os.Exit(1)
+		}
+	}
+}
+
 // 2021-08-11 (go 1.16)
 // go test -bench=. -test.benchmem
 //   BenchmarkTransduce-4         19069             60609 ns/op           11048 B/op        137 allocs/op
@@ -873,3 +885,6 @@
 //   BenchmarkLoadDatokFile-4               7         143143934 ns/op        203158450 B/op      5743 allocs/op
 //   BenchmarkTransduce-4               34939             34363 ns/op           14056 B/op          3 allocs/op
 //   BenchmarkLoadDatokFile-4               7         149511609 ns/op        203217193 B/op      5915 allocs/op
+// 2021-08-17
+//   BenchmarkTransduce-4               31204             32678 ns/op           14752 B/op          3 allocs/op
+//   BenchmarkToDoubleArray-4           44138             26850 ns/op           10704 B/op         29 allocs/op

diff --git a/testdata/simple_bench.fst b/testdata/simple_bench.fst
new file mode 100644
index 0000000..d399284
--- /dev/null
+++ b/testdata/simple_bench.fst
Binary files differ
commit	6f1c16c0b5ad979ded8d35a93319b9ce908e312b	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Aug 17 10:45:42 2021 +0200
committer	Akron <nils@diewald-online.de>	Tue Aug 17 10:45:42 2021 +0200
tree	9f0fe1395a8b687686c1a77c0fe612de35de5e15
parent	3de361e2df232b7770db6175de50f5eca314e33b [diff]