Improve offset handling in buffers
Change-Id: I1f66695a852fda1c1bd8fb1fdd418c5ecda54f66
diff --git a/datok.go b/datok.go
index 7b85809..45a9a19 100644
--- a/datok.go
+++ b/datok.go
@@ -724,6 +724,27 @@
return string(out)
}
+// showBufferNew renders buffer as a string for debug output: '^' is inserted
+// before index buffi, '|' before index bufft, and the rune at buffc is bracketed.
+func showBufferNew(buffer []rune, bufft int, buffc int, buffi int) string {
+ out := make([]rune, 0, 1024)
+ for x := 0; x < len(buffer); x++ { // walks the whole slice, not just [0, buffi)
+ if buffi == x {
+ out = append(out, '^')
+ }
+ if bufft == x {
+ out = append(out, '|')
+ }
+ if buffc == x {
+ out = append(out, '[', buffer[x], ']')
+ } else {
+ out = append(out, buffer[x])
+ }
+ }
+ return string(out)
+}
+
+// Transduce input to output
func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
return dat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
}
@@ -768,9 +789,13 @@
// Store a translation buffer as well, so characters don't
// have to be translated multiple times!
buffer := make([]rune, 1024)
- buffo := 0 // Buffer offset
+ bufft := 0 // Buffer token offset
+ buffc := 0 // Buffer current symbol
buffi := 0 // Buffer length
+ // Buffer layout (t = bufft, c = buffc, i = buffi):
+ // [ t[....c..]..i]
+
reader := bufio.NewReader(r)
defer w.Flush()
@@ -786,7 +811,7 @@
if newchar {
// Get from reader if buffer is empty
- if buffo >= buffi {
+ if buffc >= buffi {
if eof {
break
}
@@ -801,10 +826,10 @@
buffi++
}
- char = buffer[buffo]
+ char = buffer[buffc]
if DEBUG {
- fmt.Println("Current char", string(char), int(char), showBuffer(buffer, buffo, buffi))
+ fmt.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
}
eot = false
@@ -835,10 +860,10 @@
if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
// Remember state for backtracking to last tokenend state
epsilonState = t0
- epsilonOffset = buffo
+ epsilonOffset = buffc
if DEBUG {
- fmt.Println("epsilonOffset is set to", buffo)
+ fmt.Println("epsilonOffset is set to", buffc)
}
}
}
@@ -876,11 +901,11 @@
// Try again with epsilon symbol, in case everything else failed
t0 = epsilonState
epsilonState = 0 // reset
- buffo = epsilonOffset
+ buffc = epsilonOffset
a = dat.epsilon
if DEBUG {
- fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
+ fmt.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
}
} else {
@@ -898,24 +923,25 @@
// Transition consumes a character
if a != dat.epsilon {
- buffo++
+ buffc++
// Transition does not produce a character
- if buffo == 1 && ta.isNonToken() {
+ if buffc-bufft == 1 && ta.isNonToken() {
if DEBUG {
- fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
+ fmt.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
}
- rewindBuffer = true
+ bufft++
+ // rewindBuffer = true
}
} else {
// Transition marks the end of a token - so flush the buffer
- if buffo > 0 {
+ if buffc-bufft > 0 {
if DEBUG {
- fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
+ fmt.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBuffer(buffer, buffc, buffi))
}
- w.Token(0, buffer[:buffo])
+ w.Token(0, buffer[bufft:buffc])
rewindBuffer = true
sentenceEnd = false
textEnd = false
@@ -929,31 +955,33 @@
if rewindBuffer {
if DEBUG {
- fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
+ fmt.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
}
// TODO: Better as a ring buffer
- for x, i := range buffer[buffo:buffi] {
+ for x, i := range buffer[buffc:buffi] {
buffer[x] = i
}
- buffi -= buffo
+ buffi -= buffc
// epsilonOffset -= buffo
epsilonOffset = 0
epsilonState = 0
- buffo = 0
- if DEBUG {
- fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
- }
+ buffc = 0
+ bufft = 0
- if eot {
- eot = false
- textEnd = true
- w.TextEnd(0)
- if DEBUG {
- fmt.Println("END OF TEXT")
- }
+ if DEBUG {
+ fmt.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
+ }
+ }
+
+ if eot {
+ eot = false
+ textEnd = true
+ w.TextEnd(0)
+ if DEBUG {
+ fmt.Println("END OF TEXT")
}
}
@@ -1029,9 +1057,9 @@
} else if epsilonState != 0 {
t0 = epsilonState
epsilonState = 0 // reset
- buffo = epsilonOffset
+ buffc = epsilonOffset
if DEBUG {
- fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
+ fmt.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
}
goto PARSECHAR
}
diff --git a/datok_test.go b/datok_test.go
index 38eb474..6f2dc11 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -1038,8 +1038,13 @@
// BenchmarkToDoubleArray-4 63663 17675 ns/op 10703 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 16 83535733 ns/op 6357874 B/op 2577 allocs/op
// BenchmarkTransduceMatrix-4 45362 25258 ns/op 12408 B/op 6 allocs/op
-// 2021-10-21 - Introduxe EOT
+// 2021-10-22 - Introduce EOT
// BenchmarkDoubleArrayTransduce-4 43820 27661 ns/op 12408 B/op 6 allocs/op
// BenchmarkDoubleArrayConstruction-4 68259 16608 ns/op 10703 B/op 29 allocs/op
// BenchmarkDoubleArrayLarger-4 16 69889532 ns/op 6357901 B/op 2578 allocs/op
// BenchmarkMatrixTransduce-4 49426 25105 ns/op 12408 B/op 6 allocs/op
+// 2021-10-23 - Improve offset handling
+// BenchmarkDoubleArrayTransduce-4 41890 29729 ns/op 12408 B/op 6 allocs/op
+// BenchmarkDoubleArrayConstruction-4 74510 15879 ns/op 10703 B/op 29 allocs/op
+// BenchmarkDoubleArrayLarger-4 18 73752383 ns/op 6357956 B/op 2579 allocs/op
+// BenchmarkMatrixTransduce-4 46870 27140 ns/op 12408 B/op 6 allocs/op
diff --git a/matrix.go b/matrix.go
index 8c68959..98fc32c 100644
--- a/matrix.go
+++ b/matrix.go
@@ -313,10 +313,15 @@
return mat
}
+// Transduce input to output
func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
return mat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
}
+// TransduceTokenWriter transduces an input string against
+// the matrix FSA. The rules are always greedy. If the
+// automaton fails, it takes the last possible token ending
+// branch.
func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
var a int
var t0 uint32
@@ -335,9 +340,13 @@
textEnd := false
buffer := make([]rune, 1024)
- buffo := 0 // Buffer offset
+ bufft := 0 // Buffer token offset
+ buffc := 0 // Buffer current symbol
buffi := 0 // Buffer length
+ // Buffer layout (t = bufft, c = buffc, i = buffi):
+ // [ t[....c..]..i]
+
reader := bufio.NewReader(r)
defer w.Flush()
@@ -353,7 +362,7 @@
if newchar {
// Get from reader if buffer is empty
- if buffo >= buffi {
+ if buffc >= buffi {
if eof {
break
}
@@ -368,10 +377,10 @@
buffi++
}
- char = buffer[buffo]
+ char = buffer[buffc]
if DEBUG {
- fmt.Println("Current char", string(char), int(char), showBuffer(buffer, buffo, buffi))
+ fmt.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
}
eot = false
@@ -408,10 +417,10 @@
// Just Remove
t0 &= ^FIRSTBIT
epsilonState = t0
- epsilonOffset = buffo
+ epsilonOffset = buffc
if DEBUG {
- fmt.Println("epsilonOffset is set to", buffo)
+ fmt.Println("epsilonOffset is set to", buffc)
}
}
}
@@ -445,11 +454,11 @@
// Try again with epsilon symbol, in case everything else failed
t0 = epsilonState
epsilonState = 0 // reset
- buffo = epsilonOffset
+ buffc = epsilonOffset
a = mat.epsilon
if DEBUG {
- fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
+ fmt.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
}
} else {
@@ -467,23 +476,24 @@
// Transition consumes a character
if a != mat.epsilon {
- buffo++
+ buffc++
// Transition does not produce a character
- if buffo == 1 && (t&FIRSTBIT) != 0 {
+ if buffc-bufft == 1 && (t&FIRSTBIT) != 0 {
if DEBUG {
- fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
+ fmt.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
}
- rewindBuffer = true
+ bufft++
+ // rewindBuffer = true
}
} else {
// Transition marks the end of a token - so flush the buffer
- if buffo > 0 {
+ if buffc-bufft > 0 {
if DEBUG {
- fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
+ fmt.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
}
- w.Token(0, buffer[:buffo])
+ w.Token(0, buffer[bufft:buffc])
rewindBuffer = true
sentenceEnd = false
textEnd = false
@@ -491,44 +501,41 @@
sentenceEnd = true
w.SentenceEnd(0)
}
- if DEBUG {
- fmt.Println("-> Newline")
- }
- // writer.WriteRune('\n')
}
// Rewind the buffer if necessary
if rewindBuffer {
if DEBUG {
- fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
+ fmt.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
}
// TODO: Better as a ring buffer
- for x, i := range buffer[buffo:buffi] {
+ for x, i := range buffer[buffc:buffi] {
buffer[x] = i
}
- buffi -= buffo
+ buffi -= buffc
// epsilonOffset -= buffo
epsilonOffset = 0
epsilonState = 0
- buffo = 0
- if DEBUG {
- fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
- }
+ buffc = 0
+ bufft = 0
- if eot {
- eot = false
- textEnd = true
- w.TextEnd(0)
- if DEBUG {
- fmt.Println("END OF TEXT")
- }
+ if DEBUG {
+ fmt.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
}
}
+ if eot {
+ eot = false
+ textEnd = true
+ w.TextEnd(0)
+ if DEBUG {
+ fmt.Println("END OF TEXT")
+ }
+ }
t &= ^FIRSTBIT
newchar = true
@@ -562,9 +569,9 @@
} else if epsilonState != 0 {
t0 = epsilonState
epsilonState = 0 // reset
- buffo = epsilonOffset
+ buffc = epsilonOffset
if DEBUG {
- fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
+ fmt.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
}
goto PARSECHARM
}