Add fast array lookup to sigma for ASCII/Latin-1 runes (< 256)
diff --git a/datokenizer.go b/datokenizer.go
index d5a2303..4f1e12d 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -94,8 +94,8 @@
// DaTokenizer represents a tokenizer implemented as a
// Double Array FSA.
type DaTokenizer struct {
- sigma map[rune]int
- // sigmaList []rune
+ sigma map[rune]int
+ sigmaASCII [256]int
maxSize int
loadFactor float64
array []bc
@@ -509,11 +509,11 @@
dat.resize(dat.final)
- // dat.sigmaList = make([]rune, tok.sigmaCount)
-
for num, sym := range tok.sigmaRev {
+ if int(sym) < 256 {
+ dat.sigmaASCII[int(sym)] = num
+ }
dat.sigma[sym] = num
- // dat.sigmaList[num] = sym
}
mark := 0
@@ -982,18 +982,14 @@
// Shouldn't be relevant though
dat.maxSize = arraySize - 1
- // dat.sigmaList = make([]rune, sigmaCount)
-
for x := 0; x < sigmaCount; x++ {
sym, _, err := r.ReadRune()
if err == nil && sym != 0 {
+ if int(sym) < 256 {
+ dat.sigmaASCII[int(sym)] = x
+ }
dat.sigma[sym] = x
}
- /*
- if err == nil {
- dat.sigmaList[x] = sym
- }
- */
}
_, err = io.ReadFull(r, buf[0:1])
@@ -1048,19 +1044,6 @@
return string(out)
}
-/*
-func (dat *DaTokenizer) LookupSigma(r rune) (int, bool) {
- for i, l := range dat.sigmaList {
- if l == r {
- return i, true
- } else if l > r {
- return 0, false
- }
- }
- return 0, false
-}
-*/
-
// Transduce an input string against the double array
// FSA. The rules are always greedy. If the automaton fails,
// it takes the last possible token ending branch.
@@ -1133,11 +1116,17 @@
}
// TODO: Better not repeatedly check for a!
- a, ok = dat.sigma[char]
- // a, ok = dat.LookupSigma(char)
+ if int(char) < 256 {
+ a = dat.sigmaASCII[int(char)]
+ } else {
+ a, ok = dat.sigma[char]
+ if !ok {
+ a = 0
+ }
+ }
// Use identity symbol if character is not in sigma
- if !ok && dat.identity != -1 {
+ if a == 0 && dat.identity != -1 {
a = dat.identity
}
diff --git a/datokenizer_test.go b/datokenizer_test.go
index e131ce9..d11abbe 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -871,3 +871,5 @@
// BenchmarkLoadDatokFile-4 8 138937532 ns/op 203158327 B/op 5742 allocs/op
// BenchmarkTransduce-4 22005 48665 ns/op 17472 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 7 143143934 ns/op 203158450 B/op 5743 allocs/op
+// BenchmarkTransduce-4 34939 34363 ns/op 14056 B/op 3 allocs/op
+// BenchmarkLoadDatokFile-4 7 149511609 ns/op 203217193 B/op 5915 allocs/op