Introduce alternative matrix representation
diff --git a/datok.go b/datok.go
index b46da42..4f7ac6f 100644
--- a/datok.go
+++ b/datok.go
@@ -80,21 +80,21 @@
 // into a double array representation.
 //
 // This is based on Mizobuchi et al (2000), p.128
-func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
+func (auto *Automaton) ToDoubleArray() *DaTokenizer {
 
 	dat := &DaTokenizer{
 		sigma:      make(map[rune]int),
 		transCount: -1,
-		final:      tok.final,
-		unknown:    tok.unknown,
-		identity:   tok.identity,
-		epsilon:    tok.epsilon,
-		tokenend:   tok.tokenend,
+		final:      auto.final,
+		unknown:    auto.unknown,
+		identity:   auto.identity,
+		epsilon:    auto.epsilon,
+		tokenend:   auto.tokenend,
 	}
 
 	dat.resize(dat.final)
 
-	for num, sym := range tok.sigmaRev {
+	for num, sym := range auto.sigmaRev {
 		if int(sym) < 256 {
 			dat.sigmaASCII[int(sym)] = num
 		}
@@ -110,7 +110,7 @@
 
 	// Create a mapping from s (in Ms aka Intermediate FSA)
 	// to t (in Mt aka Double Array FSA)
-	table := make([]*mapping, tok.arcCount+1)
+	table := make([]*mapping, auto.arcCount+1)
 	// tableQueue := make([]int, tok.arcCount+1)
 
 	// Initialize with the start state
@@ -119,7 +119,7 @@
 	size++
 
 	// Allocate space for the outgoing symbol range
-	A := make([]int, 0, tok.sigmaCount)
+	A := make([]int, 0, auto.sigmaCount)
 	// tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
 	// tableLookup[1] = 1
 
@@ -135,7 +135,7 @@
 		// Following the paper, here the state t can be remembered
 		// in the set of states St
 		A = A[:0]
-		tok.getSet(s, &A)
+		auto.getSet(s, &A)
 
 		// Set base to the first free slot in the double array
 		// base = dat.xCheck(A)
@@ -151,9 +151,9 @@
 		// Iterate over all outgoing symbols
 		for _, a := range A {
 
-			if a != tok.final {
+			if a != auto.final {
 
-				atrans = tok.transitions[s][a]
+				atrans = auto.transitions[s][a]
 
 				// Aka g(s, a)
 				s1 = atrans.end
diff --git a/datok_test.go b/datok_test.go
index 2dae904..67617b2 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -11,15 +11,15 @@
 	"github.com/stretchr/testify/assert"
 )
 
-func tmatch(dat *DaTokenizer, s string) bool {
+func tmatch(tok Tokenizer, s string) bool {
 	b := make([]byte, 0, 2048)
 	w := bytes.NewBuffer(b)
-	return dat.Transduce(strings.NewReader(s), w)
+	return tok.Transduce(strings.NewReader(s), w)
 }
 
-func ttokenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string {
+func ttokenize(tok Tokenizer, w *bytes.Buffer, str string) []string {
 	w.Reset()
-	ok := dat.Transduce(strings.NewReader(str), w)
+	ok := tok.Transduce(strings.NewReader(str), w)
 	if !ok {
 		return []string{}
 	}
diff --git a/fomafile.go b/fomafile.go
index 46f9d71..1d46500 100644
--- a/fomafile.go
+++ b/fomafile.go
@@ -27,13 +27,18 @@
 	tokenend bool
 }
 
-// Tokenizer is the intermediate representation
+type Tokenizer interface {
+	Transduce(r io.Reader, w io.Writer) bool
+}
+
+// Automaton is the intermediate representation
 // of the tokenizer.
-type Tokenizer struct {
+type Automaton struct {
 	sigmaRev    map[int]rune
 	sigmaMCS    map[int]string
 	arcCount    int
 	sigmaCount  int
+	stateCount  int
 	transitions []map[int]*edge
 
 	// Special symbols in sigma
@@ -47,7 +52,7 @@
 // ParseFoma reads the FST from a foma file
 // and creates an internal representation,
 // in case it follows the tokenizer's convention.
-func LoadFomaFile(file string) *Tokenizer {
+func LoadFomaFile(file string) *Automaton {
 	f, err := os.Open(file)
 	if err != nil {
 		log.Print(err)
@@ -68,10 +73,10 @@
 // ParseFoma reads the FST from a foma file reader
 // and creates an internal representation,
 // in case it follows the tokenizer's convention.
-func ParseFoma(ior io.Reader) *Tokenizer {
+func ParseFoma(ior io.Reader) *Automaton {
 	r := bufio.NewReader(ior)
 
-	tok := &Tokenizer{
+	auto := &Automaton{
 		sigmaRev: make(map[int]rune),
 		sigmaMCS: make(map[int]string),
 		epsilon:  -1,
@@ -111,8 +116,8 @@
 
 				// Adds a final transition symbol to sigma
 				// written as '#' in Mizobuchi et al (2000)
-				tok.sigmaCount++
-				tok.final = tok.sigmaCount
+				auto.sigmaCount++
+				auto.final = auto.sigmaCount
 
 			} else if strings.HasPrefix(line, "##sigma##") {
 
@@ -164,7 +169,7 @@
 					log.Print("Can't read arccount")
 					return nil
 				}
-				tok.arcCount = elemint[0]
+				auto.arcCount = elemint[0]
 
 				elemint[0], err = strconv.Atoi(elem[2])
 				if err != nil {
@@ -175,7 +180,8 @@
 				// States start at 1 in Mizobuchi et al (2000),
 				// as the state 0 is associated with a fail.
 				// Initialize states and transitions
-				tok.transitions = make([]map[int]*edge, elemint[0]+1)
+				auto.stateCount = elemint[0]
+				auto.transitions = make([]map[int]*edge, elemint[0]+1)
 				continue
 			}
 		case STATES:
@@ -241,13 +247,13 @@
 							if final == 1 {
 
 								// Initialize outgoing states
-								if tok.transitions[state+1] == nil {
-									tok.transitions[state+1] = make(map[int]*edge)
+								if auto.transitions[state+1] == nil {
+									auto.transitions[state+1] = make(map[int]*edge)
 								}
 
 								// TODO:
 								//   Maybe this is less relevant for tokenizers
-								tok.transitions[state+1][tok.final] = &edge{}
+								auto.transitions[state+1][auto.final] = &edge{}
 							}
 							continue
 						} else {
@@ -283,9 +289,9 @@
 
 				// Only a limited list of transitions are allowed
 				if inSym != outSym {
-					if outSym == tok.tokenend && inSym == tok.epsilon {
+					if outSym == auto.tokenend && inSym == auto.epsilon {
 						tokenend = true
-					} else if outSym == tok.epsilon {
+					} else if outSym == auto.epsilon {
 						nontoken = true
 					} else {
 						log.Println(
@@ -297,19 +303,19 @@
 								":" +
 								strconv.Itoa(outSym) +
 								") (" +
-								string(tok.sigmaRev[inSym]) +
+								string(auto.sigmaRev[inSym]) +
 								":" +
-								string(tok.sigmaRev[outSym]) +
+								string(auto.sigmaRev[outSym]) +
 								")")
 						return nil
 					}
-				} else if inSym == tok.tokenend {
+				} else if inSym == auto.tokenend {
 					// Ignore tokenend accepting arcs
 					continue
-				} else if inSym == tok.epsilon {
+				} else if inSym == auto.epsilon {
 					log.Println("General epsilon transitions are not supported")
 					return nil
-				} else if tok.sigmaMCS[inSym] != "" {
+				} else if auto.sigmaMCS[inSym] != "" {
 					// log.Fatalln("Non supported character", tok.sigmaMCS[inSym])
 					// Ignore MCS transitions
 					continue
@@ -325,20 +331,20 @@
 				}
 
 				// Initialize outgoing states
-				if tok.transitions[state+1] == nil {
-					tok.transitions[state+1] = make(map[int]*edge)
+				if auto.transitions[state+1] == nil {
+					auto.transitions[state+1] = make(map[int]*edge)
 				}
 
 				// Ignore transitions with invalid symbols
 				if inSym >= 0 {
-					tok.transitions[state+1][inSym] = targetObj
+					auto.transitions[state+1][inSym] = targetObj
 				}
 
 				// Add final transition
 				if final == 1 {
 					// TODO:
 					//   Maybe this is less relevant for tokenizers
-					tok.transitions[state+1][tok.final] = &edge{}
+					auto.transitions[state+1][auto.final] = &edge{}
 				}
 
 				if DEBUG {
@@ -349,9 +355,9 @@
 						":",
 						outSym,
 						") (",
-						string(tok.sigmaRev[inSym]),
+						string(auto.sigmaRev[inSym]),
 						":",
-						string(tok.sigmaRev[outSym]),
+						string(auto.sigmaRev[outSym]),
 						")",
 						";",
 						"TE:", tokenend,
@@ -376,7 +382,7 @@
 					return nil
 				}
 
-				tok.sigmaCount = number
+				auto.sigmaCount = number
 
 				var symbol rune
 
@@ -390,26 +396,26 @@
 					switch elem[1] {
 					case "@_EPSILON_SYMBOL_@":
 						{
-							tok.epsilon = number
+							auto.epsilon = number
 						}
 					case "@_UNKNOWN_SYMBOL_@":
 						{
-							tok.unknown = number
+							auto.unknown = number
 						}
 
 					case "@_IDENTITY_SYMBOL_@":
 						{
-							tok.identity = number
+							auto.identity = number
 						}
 
 					case "@_TOKEN_SYMBOL_@":
 						{
-							tok.tokenend = number
+							auto.tokenend = number
 						}
 					default:
 						{
 							// MCS not supported
-							tok.sigmaMCS[number] = line
+							auto.sigmaMCS[number] = line
 						}
 					}
 					continue
@@ -422,24 +428,24 @@
 					}
 					if len(line) != 1 {
 						// MCS not supported
-						tok.sigmaMCS[number] = line
+						auto.sigmaMCS[number] = line
 						continue
 					}
 					symbol = rune('\n')
 				}
 
-				tok.sigmaRev[number] = symbol
+				auto.sigmaRev[number] = symbol
 			}
 		}
 	}
-	tok.sigmaMCS = nil
-	return tok
+	auto.sigmaMCS = nil
+	return auto
 }
 
 // Set alphabet A to the list of all symbols
 // outgoing from s
-func (tok *Tokenizer) getSet(s int, A *[]int) {
-	for a := range tok.transitions[s] {
+func (auto *Automaton) getSet(s int, A *[]int) {
+	for a := range auto.transitions[s] {
 		*A = append(*A, a)
 	}
 
diff --git a/matrix.go b/matrix.go
new file mode 100644
index 0000000..0ec940e
--- /dev/null
+++ b/matrix.go
@@ -0,0 +1,352 @@
+package datok
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+)
+
+type MatrixTokenizer struct {
+	sigma      map[rune]int
+	sigmaASCII [256]int
+	array      []int
+	stateCount int
+
+	// Special symbols in sigma
+	epsilon  int
+	unknown  int
+	identity int
+	final    int
+	tokenend int
+}
+
+// ToMatrix turns the intermediate automaton into a
+// matrix representation of the tokenizer.
+func (auto *Automaton) ToMatrix() *MatrixTokenizer {
+
+	mat := &MatrixTokenizer{
+		sigma:      make(map[rune]int),
+		final:      auto.final,
+		unknown:    auto.unknown,
+		identity:   auto.identity,
+		epsilon:    auto.epsilon,
+		tokenend:   auto.tokenend,
+		stateCount: auto.stateCount,
+	}
+
+	mat.array = make([]int, (auto.stateCount+1)*(auto.sigmaCount+1))
+
+	for num, sym := range auto.sigmaRev {
+		if int(sym) < 256 {
+			mat.sigmaASCII[int(sym)] = num
+		}
+		mat.sigma[sym] = num
+		if num > auto.sigmaCount {
+			panic("sigmaCount is smaller")
+		}
+	}
+	remember := make([]bool, auto.stateCount+2)
+
+	// Store all transitions in matrix
+	var toMatrix func([]int, int)
+
+	toMatrix = func(matrix []int, start int) {
+		if start > auto.stateCount {
+			panic("stateCount is smaller")
+		}
+		if remember[start] {
+			return
+		}
+		remember[start] = true
+		for alpha, t := range auto.transitions[start] {
+			matrix[(alpha-1)*auto.stateCount+start] = t.end
+
+			// Mark nontoken transitions
+			if t.nontoken {
+				matrix[(alpha-1)*auto.stateCount+start] *= -1
+			}
+
+			toMatrix(matrix, t.end)
+		}
+	}
+
+	toMatrix(mat.array, 1)
+
+	return mat
+}
+
+func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
+	var a int
+	var t0 int
+	t := int(1) // Initial state
+	var ok, rewindBuffer bool
+
+	// Remember the last position of a possible tokenend,
+	// in case the automaton fails.
+	epsilonState := int(0)
+	epsilonOffset := 0
+
+	buffer := make([]rune, 1024)
+	buffo := 0 // Buffer offset
+	buffi := 0 // Buffer length
+
+	reader := bufio.NewReader(r)
+	writer := bufio.NewWriter(w)
+	defer writer.Flush()
+
+	var char rune
+
+	var err error
+	eof := false
+	newchar := true
+
+PARSECHARM:
+	for {
+
+		if newchar {
+			// Get from reader if buffer is empty
+			if buffo >= buffi {
+				if eof {
+					break
+				}
+				char, _, err = reader.ReadRune()
+
+				// No more runes to read
+				if err != nil {
+					eof = true
+					break
+				}
+				buffer[buffi] = char
+				buffi++
+			}
+
+			char = buffer[buffo]
+
+			if DEBUG {
+				fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
+			}
+
+			// TODO:
+			//   Avoid looking up a repeatedly;
+			//   possibly keep a buffer with a.
+			if int(char) < 256 {
+				a = mat.sigmaASCII[int(char)]
+			} else {
+				a, ok = mat.sigma[char]
+				if !ok {
+					a = 0
+				}
+			}
+
+			// Use identity symbol if character is not in sigma
+			if a == 0 && mat.identity != -1 {
+				a = mat.identity
+			}
+
+			t0 = t
+
+			// Check for epsilon transitions and remember
+
+			if mat.array[(mat.epsilon-1)*mat.stateCount+t0] != 0 {
+				// Remember state for backtracking to last tokenend state
+				epsilonState = t0
+				epsilonOffset = buffo
+			}
+		}
+
+		// Check a transition based on t0 and a
+		t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
+		// t = mat.array[t0].getBase() + uint32(a)
+		// ta := dat.array[t]
+
+		if DEBUG {
+			// Char is only relevant if set
+			fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
+			/*
+				if false {
+					fmt.Println(dat.outgoing(t0))
+				}
+			*/
+		}
+
+		// Check if the transition is invalid according to the double array
+		// if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
+		if t == 0 {
+
+			if DEBUG {
+				fmt.Println("Match is not fine!")
+			}
+
+			if !ok && a == mat.identity {
+
+				// Try again with unknown symbol, in case identity failed
+				// Char is only relevant when set
+				if DEBUG {
+					fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
+				}
+				a = mat.unknown
+
+			} else if a != mat.epsilon {
+
+				// Try again with epsilon symbol, in case everything else failed
+				t0 = epsilonState
+				epsilonState = 0 // reset
+				buffo = epsilonOffset
+				a = mat.epsilon
+
+				if DEBUG {
+					fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
+				}
+
+			} else {
+				break
+			}
+
+			newchar = false
+			continue
+		}
+
+		// Transition was successful
+		rewindBuffer = false
+
+		// Transition consumes a character
+		if a != mat.epsilon {
+
+			buffo++
+
+			// Transition does not produce a character
+			// if buffo == 1 && ta.isNonToken() {
+			if buffo == 1 && t < 0 {
+				if DEBUG {
+					fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
+				}
+				rewindBuffer = true
+			}
+
+		} else {
+			// Transition marks the end of a token - so flush the buffer
+
+			if buffi > 0 {
+				if DEBUG {
+					fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
+				}
+				writer.WriteString(string(buffer[:buffo]))
+				rewindBuffer = true
+			}
+			if DEBUG {
+				fmt.Println("-> Newline")
+			}
+			writer.WriteRune('\n')
+		}
+
+		// Rewind the buffer if necessary
+		if rewindBuffer {
+
+			// TODO: Better as a ring buffer
+			for x, i := range buffer[buffo:buffi] {
+				buffer[x] = i
+			}
+
+			buffi -= buffo
+			epsilonOffset -= buffo
+			buffo = 0
+			if DEBUG {
+				fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
+			}
+		}
+
+		// Move to representative state
+		/*
+			if ta.isSeparate() {
+				t = ta.getBase()
+				ta = dat.array[t]
+
+				if DEBUG {
+					fmt.Println("Representative pointing to", t)
+				}
+			}
+		*/
+
+		// Ignore nontoken mark
+		if t < 0 {
+			t *= -1
+		}
+
+		newchar = true
+
+		// TODO:
+		//   Prevent endless epsilon loops!
+	}
+
+	// Input reader is not yet finished
+	if !eof {
+		if DEBUG {
+			fmt.Println("Not at the end")
+		}
+		return false
+	}
+
+	if DEBUG {
+		fmt.Println("Entering final check")
+	}
+	/*
+		// Automaton is in a final state, so flush the buffer and return
+		x := dat.array[t].getBase() + uint32(dat.final)
+
+		if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
+
+			if buffi > 0 {
+				if DEBUG {
+					fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
+				}
+				writer.WriteString(string(buffer[:buffi]))
+
+				if dat.array[t].isTokenEnd() {
+					writer.WriteRune('\n')
+					if DEBUG {
+						fmt.Println("-> Newline")
+					}
+				}
+			}
+
+			// Add an additional sentence ending, if the file is over but no explicit
+			// sentence split was reached. This may be controversial and therefore
+			// optional via parameter.
+			if !dat.array[t0].isTokenEnd() {
+				writer.WriteRune('\n')
+				if DEBUG {
+					fmt.Println("-> Newline")
+				}
+			}
+
+			// TODO:
+			//   There may be a new line at the end, from an epsilon,
+			//   so we may need to go on!
+			return true
+		}
+	*/
+
+	// Check epsilon transitions until a final state is reached
+	t0 = t
+	// t = dat.array[t0].getBase() + uint32(dat.epsilon)
+	t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
+	a = mat.epsilon
+	newchar = false
+	// if dat.array[t].getCheck() == t0 {
+	// t can't be < 0
+	if t > 0 {
+		// Remember state for backtracking to last tokenend state
+		goto PARSECHARM
+
+	} else if epsilonState != 0 {
+		t0 = epsilonState
+		epsilonState = 0 // reset
+		buffo = epsilonOffset
+		if DEBUG {
+			fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
+		}
+		goto PARSECHARM
+	}
+	return false
+
+}
diff --git a/matrix_test.go b/matrix_test.go
new file mode 100644
index 0000000..18ed3a2
--- /dev/null
+++ b/matrix_test.go
@@ -0,0 +1,33 @@
+package datok
+
+import (
+	"bytes"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestFullTokenizerMatrix(t *testing.T) {
+	assert := assert.New(t)
+	foma := LoadFomaFile("testdata/simpletok.fst")
+	assert.NotNil(foma)
+
+	mat := foma.ToMatrix()
+
+	r := strings.NewReader("  wald   gehen Da kann\t man was \"erleben\"!")
+	b := make([]byte, 0, 2048)
+	w := bytes.NewBuffer(b)
+	var tokens []string
+	mat.Transduce(r, w)
+	tokens = strings.Split(w.String(), "\n")
+	assert.Equal(len(tokens), 9)
+	assert.Equal("wald", tokens[0])
+	assert.Equal("gehen", tokens[1])
+	assert.Equal("Da", tokens[2])
+	assert.Equal("kann", tokens[3])
+	assert.Equal("man", tokens[4])
+	assert.Equal("was", tokens[5])
+	assert.Equal("\"erleben\"", tokens[6])
+	assert.Equal("!", tokens[7])
+}