Introduce alternative matrix representation

commit: 1c34ce6b9116f8cb8a2ba4acb03df3ccabb0e48b [log] [tgz]
author: Akron <nils@diewald-online.de> Thu Sep 23 23:27:39 2021 +0200
committer: Akron <nils@diewald-online.de> Thu Sep 23 23:27:39 2021 +0200
tree: 5d6e8157448d572d3a649dbc1a1be4cfb3ad266e
parent: 0d0daa23db24dff1265a1f496292daf04a9ae882 [diff] [blame]
diff --git a/matrix.go b/matrix.go
new file mode 100644
index 0000000..0ec940e
--- /dev/null
+++ b/matrix.go

@@ -0,0 +1,352 @@
+package datok
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+)
+
+type MatrixTokenizer struct {
+	sigma      map[rune]int
+	sigmaASCII [256]int
+	array      []int
+	stateCount int
+
+	// Special symbols in sigma
+	epsilon  int
+	unknown  int
+	identity int
+	final    int
+	tokenend int
+}
+
+// ToMatrix turns the intermediate tokenizer into a
+// matrix representation.
+func (auto *Automaton) ToMatrix() *MatrixTokenizer {
+
+	mat := &MatrixTokenizer{
+		sigma:      make(map[rune]int),
+		final:      auto.final,
+		unknown:    auto.unknown,
+		identity:   auto.identity,
+		epsilon:    auto.epsilon,
+		tokenend:   auto.tokenend,
+		stateCount: auto.stateCount,
+	}
+
+	mat.array = make([]int, (auto.stateCount+1)*(auto.sigmaCount+1))
+
+	for num, sym := range auto.sigmaRev {
+		if int(sym) < 256 {
+			mat.sigmaASCII[int(sym)] = num
+		}
+		mat.sigma[sym] = num
+		if num > auto.sigmaCount {
+			panic("sigmaCount is smaller")
+		}
+	}
+	remember := make([]bool, auto.stateCount+2)
+
+	// Store all transitions in matrix
+	var toMatrix func([]int, int)
+
+	toMatrix = func(matrix []int, start int) {
+		if start > auto.stateCount {
+			panic("stateCount is smaller")
+		}
+		if remember[start] {
+			return
+		}
+		remember[start] = true
+		for alpha, t := range auto.transitions[start] {
+			matrix[(alpha-1)*auto.stateCount+start] = t.end
+
+			// Mark nontoken transitions
+			if t.nontoken {
+				matrix[(alpha-1)*auto.stateCount+start] *= -1
+			}
+
+			toMatrix(matrix, t.end)
+		}
+	}
+
+	toMatrix(mat.array, 1)
+
+	return mat
+}
+
+func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
+	var a int
+	var t0 int
+	t := int(1) // Initial state
+	var ok, rewindBuffer bool
+
+	// Remember the last position of a possible tokenend,
+	// in case the automaton fails.
+	epsilonState := int(0)
+	epsilonOffset := 0
+
+	buffer := make([]rune, 1024)
+	buffo := 0 // Buffer offset
+	buffi := 0 // Buffer length
+
+	reader := bufio.NewReader(r)
+	writer := bufio.NewWriter(w)
+	defer writer.Flush()
+
+	var char rune
+
+	var err error
+	eof := false
+	newchar := true
+
+PARSECHARM:
+	for {
+
+		if newchar {
+			// Get from reader if buffer is empty
+			if buffo >= buffi {
+				if eof {
+					break
+				}
+				char, _, err = reader.ReadRune()
+
+				// No more runes to read
+				if err != nil {
+					eof = true
+					break
+				}
+				buffer[buffi] = char
+				buffi++
+			}
+
+			char = buffer[buffo]
+
+			if DEBUG {
+				fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
+			}
+
+			// TODO:
+			//   Better not repeatedly check for a!
+			//   Possibly keep a buffer with a.
+			if int(char) < 256 {
+				a = mat.sigmaASCII[int(char)]
+			} else {
+				a, ok = mat.sigma[char]
+				if !ok {
+					a = 0
+				}
+			}
+
+			// Use identity symbol if character is not in sigma
+			if a == 0 && mat.identity != -1 {
+				a = mat.identity
+			}
+
+			t0 = t
+
+			// Check for epsilon transitions and remember
+
+			if mat.array[(mat.epsilon-1)*mat.stateCount+t0] != 0 {
+				// Remember state for backtracking to last tokenend state
+				epsilonState = t0
+				epsilonOffset = buffo
+			}
+		}
+
+		// Checks a transition based on t0, a and buffo
+		t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
+		// t = mat.array[t0].getBase() + uint32(a)
+		// ta := dat.array[t]
+
+		if DEBUG {
+			// Char is only relevant if set
+			fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
+			/*
+				if false {
+					fmt.Println(dat.outgoing(t0))
+				}
+			*/
+		}
+
+		// Check if the transition is invalid according to the double array
+		// if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
+		if t == 0 {
+
+			if DEBUG {
+				fmt.Println("Match is not fine!")
+			}
+
+			if !ok && a == mat.identity {
+
+				// Try again with unknown symbol, in case identity failed
+				// Char is only relevant when set
+				if DEBUG {
+					fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
+				}
+				a = mat.unknown
+
+			} else if a != mat.epsilon {
+
+				// Try again with epsilon symbol, in case everything else failed
+				t0 = epsilonState
+				epsilonState = 0 // reset
+				buffo = epsilonOffset
+				a = mat.epsilon
+
+				if DEBUG {
+					fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
+				}
+
+			} else {
+				break
+			}
+
+			newchar = false
+			continue
+		}
+
+		// Transition was successful
+		rewindBuffer = false
+
+		// Transition consumes a character
+		if a != mat.epsilon {
+
+			buffo++
+
+			// Transition does not produce a character
+			// if buffo == 1 && ta.isNonToken() {
+			if buffo == 1 && t < 0 {
+				if DEBUG {
+					fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
+				}
+				rewindBuffer = true
+			}
+
+		} else {
+			// Transition marks the end of a token - so flush the buffer
+
+			if buffi > 0 {
+				if DEBUG {
+					fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
+				}
+				writer.WriteString(string(buffer[:buffo]))
+				rewindBuffer = true
+			}
+			if DEBUG {
+				fmt.Println("-> Newline")
+			}
+			writer.WriteRune('\n')
+		}
+
+		// Rewind the buffer if necessary
+		if rewindBuffer {
+
+			// TODO: Better as a ring buffer
+			for x, i := range buffer[buffo:buffi] {
+				buffer[x] = i
+			}
+
+			buffi -= buffo
+			epsilonOffset -= buffo
+			buffo = 0
+			if DEBUG {
+				fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
+			}
+		}
+
+		// Move to representative state
+		/*
+			if ta.isSeparate() {
+				t = ta.getBase()
+				ta = dat.array[t]
+
+				if DEBUG {
+					fmt.Println("Representative pointing to", t)
+				}
+			}
+		*/
+
+		// Ignore nontoken mark
+		if t < 0 {
+			t *= -1
+		}
+
+		newchar = true
+
+		// TODO:
+		//   Prevent endless epsilon loops!
+	}
+
+	// Input reader is not yet finished
+	if !eof {
+		if DEBUG {
+			fmt.Println("Not at the end")
+		}
+		return false
+	}
+
+	if DEBUG {
+		fmt.Println("Entering final check")
+	}
+	/*
+		// Automaton is in a final state, so flush the buffer and return
+		x := dat.array[t].getBase() + uint32(dat.final)
+
+		if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
+
+			if buffi > 0 {
+				if DEBUG {
+					fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
+				}
+				writer.WriteString(string(buffer[:buffi]))
+
+				if dat.array[t].isTokenEnd() {
+					writer.WriteRune('\n')
+					if DEBUG {
+						fmt.Println("-> Newline")
+					}
+				}
+			}
+
+			// Add an additional sentence ending, if the file is over but no explicit
+			// sentence split was reached. This may be controversial and therefore
+			// optional via parameter.
+			if !dat.array[t0].isTokenEnd() {
+				writer.WriteRune('\n')
+				if DEBUG {
+					fmt.Println("-> Newline")
+				}
+			}
+
+			// TODO:
+			//   There may be a new line at the end, from an epsilon,
+			//   so we may need to go on!
+			return true
+		}
+	*/
+
+	// Check epsilon transitions until a final state is reached
+	t0 = t
+	// t = dat.array[t0].getBase() + uint32(dat.epsilon)
+	t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
+	a = mat.epsilon
+	newchar = false
+	// if dat.array[t].getCheck() == t0 {
+	// t can't be < 0
+	if t > 0 {
+		// Remember state for backtracking to last tokenend state
+		goto PARSECHARM
+
+	} else if epsilonState != 0 {
+		t0 = epsilonState
+		epsilonState = 0 // reset
+		buffo = epsilonOffset
+		if DEBUG {
+			fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
+		}
+		goto PARSECHARM
+	}
+	return false
+
+}
commit	1c34ce6b9116f8cb8a2ba4acb03df3ccabb0e48b	[log] [tgz]
author	Akron <nils@diewald-online.de>	Thu Sep 23 23:27:39 2021 +0200
committer	Akron <nils@diewald-online.de>	Thu Sep 23 23:27:39 2021 +0200
tree	5d6e8157448d572d3a649dbc1a1be4cfb3ad266e
parent	0d0daa23db24dff1265a1f496292daf04a9ae882 [diff] [blame]