Minor optimization on edges

commit: de18e90cadb5184c10304e41a9f73ffe30720cde [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Aug 27 09:34:12 2021 +0200
committer: Akron <nils@diewald-online.de> Fri Aug 27 09:34:12 2021 +0200
tree: 07d195bdfb970e21b02402be0bfa39a2721991c8
parent: 6f1c16c0b5ad979ded8d35a93319b9ce908e312b [diff]
diff --git a/datokenizer.go b/datokenizer.go
index cb3b822..9f8079b 100644
--- a/datokenizer.go
+++ b/datokenizer.go

@@ -517,6 +517,9 @@
 	mark := 0
 	size := 0
 	var base uint32
+	var atrans *edge
+	var s, s1 int
+	var t, t1 uint32
 
 	// Create a mapping from s (in Ms aka Intermediate FSA)
 	// to t (in Mt aka Double Array FSA)
@@ -529,9 +532,17 @@
 	// Allocate space for the outgoing symbol range
 	A := make([]int, 0, tok.sigmaCount)
 
+	// TODO:
+	//   Table lookup for the moment
+	//   only gives a minor performance benefit.
+	//   should be rewritten and should preplace the
+	//   table all together.
+	//   tableLookup := make(map[int]uint32)
+	//   tableLookup[1] = 1
+
 	for mark < size {
-		s := table[mark].source // This is a state in Ms
-		t := table[mark].target // This is a state in Mt
+		s = table[mark].source // This is a state in Ms
+		t = table[mark].target // This is a state in Mt
 		mark++
 
 		// Following the paper, here the state t can be remembered
@@ -552,11 +563,13 @@
 
 			if a != tok.final {
 
+				atrans = tok.transitions[s][a]
+
 				// Aka g(s, a)
-				s1 := tok.transitions[s][a].end
+				s1 = atrans.end
 
 				// Store the transition
-				t1 := base + uint32(a)
+				t1 = base + uint32(a)
 				dat.array[t1].setCheck(t)
 
 				// Set maxSize
@@ -570,7 +583,7 @@
 				}
 
 				// Mark the state as being the target of a nontoken transition
-				if tok.transitions[s][a].nontoken {
+				if atrans.nontoken {
 					dat.array[t1].setNonToken(true)
 					if DEBUG {
 						fmt.Println("Set", t1, "to nontoken")
@@ -578,7 +591,7 @@
 				}
 
 				// Mark the state as being the target of a tokenend transition
-				if tok.transitions[s][a].tokenend {
+				if atrans.tokenend {
 					dat.array[t1].setTokenEnd(true)
 					if DEBUG {
 						fmt.Println("Set", t1, "to tokenend")
@@ -587,11 +600,13 @@
 
 				// Check for representative states
 				r := stateAlreadyInTable(s1, table, size)
+				// r := tableLookup[s1]
 
 				// No representative found
 				if r == 0 {
 					// Remember the mapping
 					table[size] = &mapping{source: s1, target: t1}
+					// tableLookup[s1] = t1
 					size++
 				} else {
 					// Overwrite with the representative state
@@ -602,8 +617,8 @@
 				// Store a final transition
 				dat.array[base+uint32(dat.final)].setCheck(t)
 
-				if dat.maxSize < int(base+uint32(dat.final)) {
-					dat.maxSize = int(base + uint32(dat.final))
+				if dat.maxSize < int(base)+dat.final {
+					dat.maxSize = int(base) + dat.final
 				}
 			}
 		}

diff --git a/datokenizer_test.go b/datokenizer_test.go
index 73282d2..ef6ccb4 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go

@@ -157,9 +157,10 @@
 	assert := assert.New(t)
 	tok := LoadFomaFile("testdata/tokenizer.fst")
 	dat := tok.ToDoubleArray()
-	n, err := dat.Save("testdata/tokenizer.datok")
-	assert.Nil(err)
-	assert.True(n > 500)
+	assert.NotNil(dat)
+	// n, err := dat.Save("testdata/tokenizer.datok")
+	// assert.Nil(err)
+	// assert.True(n > 500)
 }
 
 func TestFullTokenizerTransduce(t *testing.T) {
@@ -888,3 +889,5 @@
 // 2021-08-17
 //   BenchmarkTransduce-4               31204             32678 ns/op           14752 B/op          3 allocs/op
 //   BenchmarkToDoubleArray-4           44138             26850 ns/op           10704 B/op         29 allocs/op
+//   BenchmarkTransduce-4               29376             34562 ns/op           15157 B/op          3 allocs/op
+//   BenchmarkToDoubleArray-4           54441             21355 ns/op           10704 B/op         29 allocs/op
commit	de18e90cadb5184c10304e41a9f73ffe30720cde	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Aug 27 09:34:12 2021 +0200
committer	Akron <nils@diewald-online.de>	Fri Aug 27 09:34:12 2021 +0200
tree	07d195bdfb970e21b02402be0bfa39a2721991c8
parent	6f1c16c0b5ad979ded8d35a93319b9ce908e312b [diff]