Turn uint32 array into a bc array
diff --git a/datokenizer.go b/datokenizer.go
index 4e870d9..d5a2303 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -20,8 +20,6 @@
// looked up in a map or similar.
// - replace maxSize with the check value
// - Add checksum to serialization.
-// - Introduce methods on BC array entries instead of
-// jumping into the entries all the time!
// - Instead of memoizing the loadFactor, better remember
// the number of set transitions
// - Replace/Enhance table with a map
@@ -88,6 +86,11 @@
tokenend int
}
+type bc struct {
+ base uint32
+ check uint32
+}
+
// DaTokenizer represents a tokenizer implemented as a
// Double Array FSA.
type DaTokenizer struct {
@@ -95,7 +98,7 @@
// sigmaList []rune
maxSize int
loadFactor float64
- array []uint32
+ array []bc
// lastFilledBase uint32
// Special symbols in sigma
@@ -504,6 +507,8 @@
// lastFilledBase: 1,
}
+ dat.resize(dat.final)
+
// dat.sigmaList = make([]rune, tok.sigmaCount)
for num, sym := range tok.sigmaRev {
@@ -536,7 +541,7 @@
tok.getSet(s, &A)
// Set base to the first free slot in the double array
- dat.setBase(t, dat.xCheck(A))
+ dat.array[t].setBase(dat.xCheck(A))
// TODO:
// Sort the outgoing transitions based on the
@@ -551,8 +556,13 @@
s1 := tok.transitions[s][a].end
// Store the transition
- t1 := dat.getBase(t) + uint32(a)
- dat.setCheck(t1, t)
+ t1 := dat.array[t].getBase() + uint32(a)
+ dat.array[t1].setCheck(t)
+
+ // Set maxSize
+ if dat.maxSize < int(t1) {
+ dat.maxSize = int(t1)
+ }
if DEBUG {
fmt.Println("Translate transition",
@@ -561,7 +571,7 @@
// Mark the state as being the target of a nontoken transition
if tok.transitions[s][a].nontoken {
- dat.setNonToken(t1, true)
+ dat.array[t1].setNonToken(true)
if DEBUG {
fmt.Println("Set", t1, "to nontoken")
}
@@ -569,7 +579,7 @@
// Mark the state as being the target of a tokenend transition
if tok.transitions[s][a].tokenend {
- dat.setTokenEnd(t1, true)
+ dat.array[t1].setTokenEnd(true)
if DEBUG {
fmt.Println("Set", t1, "to tokenend")
}
@@ -585,18 +595,23 @@
size++
} else {
// Overwrite with the representative state
- dat.setBase(t1, r)
- dat.setSeparate(t1, true)
+ dat.array[t1].setBase(r)
+ dat.array[t1].setSeparate(true)
}
} else {
// Store a final transition
- dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
+ dat.array[dat.array[t].getBase()+uint32(dat.final)].setCheck(t)
+
+ if dat.maxSize < int(dat.array[t].getBase()+uint32(dat.final)) {
+ dat.maxSize = int(dat.array[t].getBase() + uint32(dat.final))
+ }
}
}
}
// Following Mizobuchi et al (2000) the size of the
// FSA should be stored in check(1).
+ // We make the size a bit smaller so we never have to check for boundaries.
dat.setSize(dat.maxSize + 1)
dat.array = dat.array[:dat.maxSize+1]
return dat
@@ -620,96 +635,80 @@
// TODO:
// This is a bit too aggressive atm and should be calmed down.
if len(dat.array) <= l {
- dat.array = append(dat.array, make([]uint32, l)...)
+ dat.array = append(dat.array, make([]bc, l)...)
}
}
// Set base value in double array
-func (dat *DaTokenizer) setBase(p uint32, v uint32) {
- l := int(p*2 + 1)
- if dat.maxSize < l {
- dat.resize(l)
- dat.maxSize = l
- }
- dat.array[l-1] = v
+func (bc *bc) setBase(v uint32) {
+ bc.base = v
}
// Get base value in double array
-func (dat *DaTokenizer) getBase(p uint32) uint32 {
- if int(p*2) > dat.maxSize {
- return 0
- }
- return dat.array[p*2] & RESTBIT
+func (bc *bc) getBase() uint32 {
+ return bc.base & RESTBIT
}
// Set check value in double array
-func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
- l := int(p*2 + 1)
- if dat.maxSize < l {
- dat.resize(l)
- dat.maxSize = l
- }
- dat.array[l] = v
+func (bc *bc) setCheck(v uint32) {
+ bc.check = v
}
// Get check value in double array
-func (dat *DaTokenizer) getCheck(p uint32) uint32 {
- if int((p*2)+1) > dat.maxSize {
- return 0
- }
- return dat.array[(p*2)+1] & RESTBIT
+func (bc *bc) getCheck() uint32 {
+ return bc.check & RESTBIT
}
// Returns true if a state is separate pointing to a representative
-func (dat *DaTokenizer) isSeparate(p uint32) bool {
- return dat.array[p*2]&FIRSTBIT != 0
+func (bc *bc) isSeparate() bool {
+ return bc.base&FIRSTBIT != 0
}
// Mark a state as separate pointing to a representative
-func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
+func (bc *bc) setSeparate(sep bool) {
if sep {
- dat.array[p*2] |= FIRSTBIT
+ bc.base |= FIRSTBIT
} else {
- dat.array[p*2] &= (RESTBIT | SECONDBIT)
+ bc.base &= (RESTBIT | SECONDBIT)
}
}
// Returns true if a state is the target of a nontoken transition
-func (dat *DaTokenizer) isNonToken(p uint32) bool {
- return dat.array[p*2+1]&FIRSTBIT != 0
+func (bc *bc) isNonToken() bool {
+ return bc.check&FIRSTBIT != 0
}
// Mark a state as being the target of a nontoken transition
-func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
+func (bc *bc) setNonToken(sep bool) {
if sep {
- dat.array[p*2+1] |= FIRSTBIT
+ bc.check |= FIRSTBIT
} else {
- dat.array[p*2+1] &= (RESTBIT | SECONDBIT)
+ bc.check &= (RESTBIT | SECONDBIT)
}
}
// Returns true if a state is the target of a tokenend transition
-func (dat *DaTokenizer) isTokenEnd(p uint32) bool {
- return dat.array[p*2+1]&SECONDBIT != 0
+func (bc *bc) isTokenEnd() bool {
+ return bc.check&SECONDBIT != 0
}
// Mark a state as being the target of a tokenend transition
-func (dat *DaTokenizer) setTokenEnd(p uint32, sep bool) {
+func (bc *bc) setTokenEnd(sep bool) {
if sep {
- dat.array[p*2+1] |= SECONDBIT
+ bc.check |= SECONDBIT
} else {
- dat.array[p*2+1] &= (RESTBIT | FIRSTBIT)
+ bc.check &= (RESTBIT | FIRSTBIT)
}
}
// Set size of double array
func (dat *DaTokenizer) setSize(v int) {
- dat.setCheck(1, uint32(v))
+ dat.array[1].setCheck(uint32(v))
}
// Get size of double array
func (dat *DaTokenizer) GetSize() int {
- return int(dat.getCheck(1))
+ return int(dat.array[1].getCheck())
}
// Based on Mizobuchi et al (2000), p. 124
@@ -735,9 +734,9 @@
*/
// Resize the array if necessary
- dat.resize((int(base) + dat.final) * 2)
+ dat.resize(int(base) + dat.final)
for _, a := range symbols {
- if dat.getCheck(base+uint32(a)) != 0 {
+ if dat.array[int(base)+a].getCheck() != 0 {
base++
goto OVERLAP
}
@@ -752,15 +751,15 @@
valid := make([]int, 0, len(dat.sigma))
for _, a := range dat.sigma {
- t1 := dat.getBase(t) + uint32(a)
- if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
+ t1 := dat.array[t].getBase() + uint32(a)
+ if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
valid = append(valid, a)
}
}
for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
- t1 := dat.getBase(t) + uint32(a)
- if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
+ t1 := dat.array[t].getBase() + uint32(a)
+ if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
valid = append(valid, -1*a)
}
}
@@ -780,8 +779,8 @@
}
nonEmpty := 0
all := len(dat.array) / 2
- for x := 1; x <= len(dat.array); x = x + 2 {
- if dat.array[x] != 0 {
+ for x := 1; x < len(dat.array); x++ {
+ if dat.array[x].getBase() != 0 {
nonEmpty++
}
}
@@ -840,7 +839,7 @@
bo.PutUint16(buf[6:8], uint16(dat.identity))
bo.PutUint16(buf[8:10], uint16(dat.final))
bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
- bo.PutUint32(buf[12:16], uint32(len(dat.array)))
+ bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
more, err := wb.Write(buf[0:16])
if err != nil {
log.Println(err)
@@ -873,17 +872,28 @@
}
all += more
- for x := 0; x < len(dat.array); x++ {
- // for _, d := range dat.array {
- bo.PutUint32(buf[0:4], dat.array[x])
- more, err := wb.Write(buf[0:4])
+ // for x := 0; x < len(dat.array); x++ {
+ for _, bc := range dat.array {
+ bo.PutUint32(buf[0:4], bc.base)
+ more, err = wb.Write(buf[0:4])
if err != nil {
log.Println(err)
return int64(all), err
}
all += more
if more != 4 {
- log.Println("Can not write uint32")
+ log.Println("Can not write base uint32")
+ return int64(all), err
+ }
+ bo.PutUint32(buf[0:4], bc.check)
+ more, err = wb.Write(buf[0:4])
+ if err != nil {
+ log.Println(err)
+ return int64(all), err
+ }
+ all += more
+ if more != 4 {
+ log.Println("Can not write check uint32")
return int64(all), err
}
}
@@ -967,7 +977,7 @@
dat.final = int(bo.Uint16(buf[8:10]))
sigmaCount := int(bo.Uint16(buf[10:12]))
- arraySize := int(bo.Uint32(buf[12:16]))
+ arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
// Shouldn't be relevant though
dat.maxSize = arraySize - 1
@@ -999,7 +1009,7 @@
}
// Read based on length
- dat.array = make([]uint32, arraySize)
+ dat.array = make([]bc, arraySize)
dataArray, err := io.ReadAll(r)
@@ -1008,13 +1018,14 @@
return nil
}
- if len(dataArray) < arraySize*4 {
+ if len(dataArray) < arraySize*8 {
log.Println("Not enough bytes read")
return nil
}
for x := 0; x < arraySize; x++ {
- dat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
+ dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
+ dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
}
return dat
@@ -1133,7 +1144,7 @@
t0 = t
// Check for epsilon transitions and remember
- if dat.getCheck(dat.getBase(t0)+uint32(dat.epsilon)) == t0 {
+ if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
// Remember state for backtracking to last tokenend state
epsilonState = t0
epsilonOffset = buffo
@@ -1141,7 +1152,8 @@
}
// Checks a transition based on t0, a and buffo
- t = dat.getBase(t0) + uint32(a)
+ t = dat.array[t0].getBase() + uint32(a)
+ ta := dat.array[t]
if DEBUG {
// Char is only relevant if set
@@ -1152,10 +1164,10 @@
}
// Check if the transition is invalid according to the double array
- if t > dat.getCheck(1) || dat.getCheck(t) != t0 {
+ if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
if DEBUG {
- fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", t0)
+ fmt.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
}
if !ok && a == dat.identity {
@@ -1196,7 +1208,7 @@
buffo++
// Transition does not produce a character
- if buffo == 1 && dat.isNonToken(t) {
+ if buffo == 1 && ta.isNonToken() {
if DEBUG {
fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
}
@@ -1205,7 +1217,7 @@
}
// Transition marks the end of a token - so flush the buffer
- if dat.isTokenEnd(t) {
+ if ta.isTokenEnd() {
if buffi > 0 {
if DEBUG {
@@ -1237,8 +1249,9 @@
}
// Move to representative state
- if dat.isSeparate(t) {
- t = dat.getBase(t)
+ if ta.isSeparate() {
+ t = ta.getBase()
+ ta = dat.array[t]
if DEBUG {
fmt.Println("Representative pointing to", t)
@@ -1264,7 +1277,9 @@
}
// Automaton is in a final state, so flush the buffer and return
- if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
+ x := dat.array[t].getBase() + uint32(dat.final)
+
+ if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
if buffi > 0 {
if DEBUG {
@@ -1272,7 +1287,7 @@
}
writer.WriteString(string(buffer[:buffi]))
- if dat.isTokenEnd(t) {
+ if dat.array[t].isTokenEnd() {
writer.WriteRune('\n')
if DEBUG {
fmt.Println("-> Newline")
@@ -1283,7 +1298,7 @@
// Add an additional sentence ending, if the file is over but no explicit
// sentence split was reached. This may be controversial and therefore
// optional via parameter.
- if !dat.isTokenEnd(t0) {
+ if !dat.array[t0].isTokenEnd() {
writer.WriteRune('\n')
if DEBUG {
fmt.Println("-> Newline")
@@ -1298,10 +1313,10 @@
// Check epsilon transitions until a final state is reached
t0 = t
- t = dat.getBase(t0) + uint32(dat.epsilon)
+ t = dat.array[t0].getBase() + uint32(dat.epsilon)
a = dat.epsilon
newchar = false
- if dat.getCheck(t) == t0 {
+ if dat.array[t].getCheck() == t0 {
// Remember state for backtracking to last tokenend state
goto PARSECHAR
diff --git a/datokenizer_test.go b/datokenizer_test.go
index ade17cf..e131ce9 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -146,9 +146,8 @@
assert.Equal(dat.identity, 3)
assert.Equal(dat.final, 137)
assert.Equal(len(dat.sigma), 132)
- assert.True(len(dat.array) > 3800000)
- assert.True(dat.maxSize > 3800000)
-
+ assert.True(len(dat.array) > 3600000)
+ assert.True(dat.maxSize > 3600000)
assert.True(tmatch(dat, "bau"))
assert.True(tmatch(dat, "bad"))
assert.True(tmatch(dat, "wald gehen"))
@@ -862,8 +861,13 @@
// 2021-08-11 (go 1.16)
// go test -bench=. -test.benchmem
// BenchmarkTransduce-4 19069 60609 ns/op 11048 B/op 137 allocs/op
-// 2021-08-112 (go 1.16)
+// 2021-08-12 (go 1.16)
// BenchmarkTransduce-4 20833 55241 ns/op 9676 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 4 258418169 ns/op 29916470 B/op 5697 allocs/op
// BenchmarkTransduce-4 19430 58133 ns/op 18696 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 8 139071939 ns/op 203158377 B/op 5742 allocs/op
+// 2021-08-16
+// BenchmarkTransduce-4 22251 49989 ns/op 17370 B/op 3 allocs/op
+// BenchmarkLoadDatokFile-4 8 138937532 ns/op 203158327 B/op 5742 allocs/op
+// BenchmarkTransduce-4 22005 48665 ns/op 17472 B/op 3 allocs/op
+// BenchmarkLoadDatokFile-4 7 143143934 ns/op 203158450 B/op 5743 allocs/op