matrix.go - KorAP/Datok - Gitiles

 package datok

 import (
 	"bufio"
 	"fmt"
 	"io"
 )

 type MatrixTokenizer struct {
 	sigma      map[rune]int
 	sigmaASCII [256]int
 	array      []int
 	stateCount int

 	// Special symbols in sigma
 	epsilon  int
 	unknown  int
 	identity int
 	final    int
 	tokenend int
 }

 // ToMatrix turns the intermediate tokenizer into a
 // matrix representation.
 func (auto *Automaton) ToMatrix() *MatrixTokenizer {

 	mat := &MatrixTokenizer{
 		sigma:      make(map[rune]int),
 		final:      auto.final,
 		unknown:    auto.unknown,
 		identity:   auto.identity,
 		epsilon:    auto.epsilon,
 		tokenend:   auto.tokenend,
 		stateCount: auto.stateCount,
 	}

 	mat.array = make([]int, (auto.stateCount+1)*(auto.sigmaCount+1))

 	for num, sym := range auto.sigmaRev {
 		if int(sym) < 256 {
 			mat.sigmaASCII[int(sym)] = num
 		}
 		mat.sigma[sym] = num
 		if num > auto.sigmaCount {
 			panic("sigmaCount is smaller")
 		}
 	}
 	remember := make([]bool, auto.stateCount+2)

 	// Store all transitions in matrix
 	var toMatrix func([]int, int)

 	toMatrix = func(matrix []int, start int) {
 		if start > auto.stateCount {
 			panic("stateCount is smaller")
 		}
 		if remember[start] {
 			return
 		}
 		remember[start] = true
 		for alpha, t := range auto.transitions[start] {
 			matrix[(alpha-1)*auto.stateCount+start] = t.end

 			// Mark nontoken transitions
 			if t.nontoken {
 				matrix[(alpha-1)*auto.stateCount+start] *= -1
 			}

 			toMatrix(matrix, t.end)
 		}
 	}

 	toMatrix(mat.array, 1)

 	return mat
 }

 func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
 	var a int
 	var t0 int
 	t := int(1) // Initial state
 	var ok, rewindBuffer bool

 	// Remember the last position of a possible tokenend,
 	// in case the automaton fails.
 	epsilonState := int(0)
 	epsilonOffset := 0

 	buffer := make([]rune, 1024)
 	buffo := 0 // Buffer offset
 	buffi := 0 // Buffer length

 	reader := bufio.NewReader(r)
 	writer := bufio.NewWriter(w)
 	defer writer.Flush()

 	var char rune

 	var err error
 	eof := false
 	newchar := true

 PARSECHARM:
 	for {

 		if newchar {
 			// Get from reader if buffer is empty
 			if buffo >= buffi {
 				if eof {
 					break
 				}
 				char, _, err = reader.ReadRune()

 				// No more runes to read
 				if err != nil {
 					eof = true
 					break
 				}
 				buffer[buffi] = char
 				buffi++
 			}

 			char = buffer[buffo]

 			if DEBUG {
 				fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
 			}

 			// TODO:
 			//   Better not repeatedly check for a!
 			//   Possibly keep a buffer with a.
 			if int(char) < 256 {
 				a = mat.sigmaASCII[int(char)]
 			} else {
 				a, ok = mat.sigma[char]
 				if !ok {
 					a = 0
 				}
 			}

 			// Use identity symbol if character is not in sigma
 			if a == 0 && mat.identity != -1 {
 				a = mat.identity
 			}

 			t0 = t

 			// Check for epsilon transitions and remember

 			if mat.array[(mat.epsilon-1)*mat.stateCount+t0] != 0 {
 				// Remember state for backtracking to last tokenend state
 				epsilonState = t0
 				epsilonOffset = buffo
 			}
 		}

 		// Checks a transition based on t0, a and buffo
 		t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
 		// t = mat.array[t0].getBase() + uint32(a)
 		// ta := dat.array[t]

 		if DEBUG {
 			// Char is only relevant if set
 			fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
 			/*
 				if false {
 					fmt.Println(dat.outgoing(t0))
 				}
 			*/
 		}

 		// Check if the transition is invalid according to the double array
 		// if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
 		if t == 0 {

 			if DEBUG {
 				fmt.Println("Match is not fine!")
 			}

 			if !ok && a == mat.identity {

 				// Try again with unknown symbol, in case identity failed
 				// Char is only relevant when set
 				if DEBUG {
 					fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
 				}
 				a = mat.unknown

 			} else if a != mat.epsilon {

 				// Try again with epsilon symbol, in case everything else failed
 				t0 = epsilonState
 				epsilonState = 0 // reset
 				buffo = epsilonOffset
 				a = mat.epsilon

 				if DEBUG {
 					fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
 				}

 			} else {
 				break
 			}

 			newchar = false
 			continue
 		}

 		// Transition was successful
 		rewindBuffer = false

 		// Transition consumes a character
 		if a != mat.epsilon {

 			buffo++

 			// Transition does not produce a character
 			// if buffo == 1 && ta.isNonToken() {
 			if buffo == 1 && t < 0 {
 				if DEBUG {
 					fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
 				}
 				rewindBuffer = true
 			}

 		} else {
 			// Transition marks the end of a token - so flush the buffer

 			if buffi > 0 {
 				if DEBUG {
 					fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
 				}
 				writer.WriteString(string(buffer[:buffo]))
 				rewindBuffer = true
 			}
 			if DEBUG {
 				fmt.Println("-> Newline")
 			}
 			writer.WriteRune('\n')
 		}

 		// Rewind the buffer if necessary
 		if rewindBuffer {

 			// TODO: Better as a ring buffer
 			for x, i := range buffer[buffo:buffi] {
 				buffer[x] = i
 			}

 			buffi -= buffo
 			epsilonOffset -= buffo
 			buffo = 0
 			if DEBUG {
 				fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
 			}
 		}

 		// Move to representative state
 		/*
 			if ta.isSeparate() {
 				t = ta.getBase()
 				ta = dat.array[t]

 				if DEBUG {
 					fmt.Println("Representative pointing to", t)
 				}
 			}
 		*/

 		// Ignore nontoken mark
 		if t < 0 {
 			t *= -1
 		}

 		newchar = true

 		// TODO:
 		//   Prevent endless epsilon loops!
 	}

 	// Input reader is not yet finished
 	if !eof {
 		if DEBUG {
 			fmt.Println("Not at the end")
 		}
 		return false
 	}

 	if DEBUG {
 		fmt.Println("Entering final check")
 	}
 	/*
 		// Automaton is in a final state, so flush the buffer and return
 		x := dat.array[t].getBase() + uint32(dat.final)

 		if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {

 			if buffi > 0 {
 				if DEBUG {
 					fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
 				}
 				writer.WriteString(string(buffer[:buffi]))

 				if dat.array[t].isTokenEnd() {
 					writer.WriteRune('\n')
 					if DEBUG {
 						fmt.Println("-> Newline")
 					}
 				}
 			}

 			// Add an additional sentence ending, if the file is over but no explicit
 			// sentence split was reached. This may be controversial and therefore
 			// optional via parameter.
 			if !dat.array[t0].isTokenEnd() {
 				writer.WriteRune('\n')
 				if DEBUG {
 					fmt.Println("-> Newline")
 				}
 			}

 			// TODO:
 			//   There may be a new line at the end, from an epsilon,
 			//   so we may need to go on!
 			return true
 		}
 	*/

 	// Check epsilon transitions until a final state is reached
 	t0 = t
 	// t = dat.array[t0].getBase() + uint32(dat.epsilon)
 	t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
 	a = mat.epsilon
 	newchar = false
 	// if dat.array[t].getCheck() == t0 {
 	// t can't be < 0
 	if t > 0 {
 		// Remember state for backtracking to last tokenend state
 		goto PARSECHARM

 	} else if epsilonState != 0 {
 		t0 = epsilonState
 		epsilonState = 0 // reset
 		buffo = epsilonOffset
 		if DEBUG {
 			fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
 		}
 		goto PARSECHARM
 	}
 	return false

 }
	package datok

	import (
	"bufio"
	"fmt"
	"io"
	)

	type MatrixTokenizer struct {
	sigma map[rune]int
	sigmaASCII [256]int
	array []int
	stateCount int

	// Special symbols in sigma
	epsilon int
	unknown int
	identity int
	final int
	tokenend int
	}

	// ToMatrix turns the intermediate tokenizer into a
	// matrix representation.
	func (auto Automaton) ToMatrix() MatrixTokenizer {

	mat := &MatrixTokenizer{
	sigma: make(map[rune]int),
	final: auto.final,
	unknown: auto.unknown,
	identity: auto.identity,
	epsilon: auto.epsilon,
	tokenend: auto.tokenend,
	stateCount: auto.stateCount,
	}

	mat.array = make([]int, (auto.stateCount+1)*(auto.sigmaCount+1))

	for num, sym := range auto.sigmaRev {
	if int(sym) < 256 {
	mat.sigmaASCII[int(sym)] = num
	}
	mat.sigma[sym] = num
	if num > auto.sigmaCount {
	panic("sigmaCount is smaller")
	}
	}
	remember := make([]bool, auto.stateCount+2)

	// Store all transitions in matrix
	var toMatrix func([]int, int)

	toMatrix = func(matrix []int, start int) {
	if start > auto.stateCount {
	panic("stateCount is smaller")
	}
	if remember[start] {
	return
	}
	remember[start] = true
	for alpha, t := range auto.transitions[start] {
	matrix[(alpha-1)*auto.stateCount+start] = t.end

	// Mark nontoken transitions
	if t.nontoken {
	matrix[(alpha-1)auto.stateCount+start] = -1
	}

	toMatrix(matrix, t.end)
	}
	}

	toMatrix(mat.array, 1)

	return mat
	}

	func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
	var a int
	var t0 int
	t := int(1) // Initial state
	var ok, rewindBuffer bool

	// Remember the last position of a possible tokenend,
	// in case the automaton fails.
	epsilonState := int(0)
	epsilonOffset := 0

	buffer := make([]rune, 1024)
	buffo := 0 // Buffer offset
	buffi := 0 // Buffer length

	reader := bufio.NewReader(r)
	writer := bufio.NewWriter(w)
	defer writer.Flush()

	var char rune

	var err error
	eof := false
	newchar := true

	PARSECHARM:
	for {

	if newchar {
	// Get from reader if buffer is empty
	if buffo >= buffi {
	if eof {
	break
	}
	char, _, err = reader.ReadRune()

	// No more runes to read
	if err != nil {
	eof = true
	break
	}
	buffer[buffi] = char
	buffi++
	}

	char = buffer[buffo]

	if DEBUG {
	fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
	}

	// TODO:
	// Better not repeatedly check for a!
	// Possibly keep a buffer with a.
	if int(char) < 256 {
	a = mat.sigmaASCII[int(char)]
	} else {
	a, ok = mat.sigma[char]
	if !ok {
	a = 0
	}
	}

	// Use identity symbol if character is not in sigma
	if a == 0 && mat.identity != -1 {
	a = mat.identity
	}

	t0 = t

	// Check for epsilon transitions and remember

	if mat.array[(mat.epsilon-1)*mat.stateCount+t0] != 0 {
	// Remember state for backtracking to last tokenend state
	epsilonState = t0
	epsilonOffset = buffo
	}
	}

	// Checks a transition based on t0, a and buffo
	t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
	// t = mat.array[t0].getBase() + uint32(a)
	// ta := dat.array[t]

	if DEBUG {
	// Char is only relevant if set
	fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
	/*
	if false {
	fmt.Println(dat.outgoing(t0))
	}
	*/
	}

	// Check if the transition is invalid according to the double array
	// if t > dat.array[1].getCheck() \|\| ta.getCheck() != t0 {
	if t == 0 {

	if DEBUG {
	fmt.Println("Match is not fine!")
	}

	if !ok && a == mat.identity {

	// Try again with unknown symbol, in case identity failed
	// Char is only relevant when set
	if DEBUG {
	fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
	}
	a = mat.unknown

	} else if a != mat.epsilon {

	// Try again with epsilon symbol, in case everything else failed
	t0 = epsilonState
	epsilonState = 0 // reset
	buffo = epsilonOffset
	a = mat.epsilon

	if DEBUG {
	fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
	}

	} else {
	break
	}

	newchar = false
	continue
	}

	// Transition was successful
	rewindBuffer = false

	// Transition consumes a character
	if a != mat.epsilon {

	buffo++

	// Transition does not produce a character
	// if buffo == 1 && ta.isNonToken() {
	if buffo == 1 && t < 0 {
	if DEBUG {
	fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
	}
	rewindBuffer = true
	}

	} else {
	// Transition marks the end of a token - so flush the buffer

	if buffi > 0 {
	if DEBUG {
	fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
	}
	writer.WriteString(string(buffer[:buffo]))
	rewindBuffer = true
	}
	if DEBUG {
	fmt.Println("-> Newline")
	}
	writer.WriteRune('\n')
	}

	// Rewind the buffer if necessary
	if rewindBuffer {

	// TODO: Better as a ring buffer
	for x, i := range buffer[buffo:buffi] {
	buffer[x] = i
	}

	buffi -= buffo
	epsilonOffset -= buffo
	buffo = 0
	if DEBUG {
	fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
	}
	}

	// Move to representative state
	/*
	if ta.isSeparate() {
	t = ta.getBase()
	ta = dat.array[t]

	if DEBUG {
	fmt.Println("Representative pointing to", t)
	}
	}
	*/

	// Ignore nontoken mark
	if t < 0 {
	t *= -1
	}

	newchar = true

	// TODO:
	// Prevent endless epsilon loops!
	}

	// Input reader is not yet finished
	if !eof {
	if DEBUG {
	fmt.Println("Not at the end")
	}
	return false
	}

	if DEBUG {
	fmt.Println("Entering final check")
	}
	/*
	// Automaton is in a final state, so flush the buffer and return
	x := dat.array[t].getBase() + uint32(dat.final)

	if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {

	if buffi > 0 {
	if DEBUG {
	fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
	}
	writer.WriteString(string(buffer[:buffi]))

	if dat.array[t].isTokenEnd() {
	writer.WriteRune('\n')
	if DEBUG {
	fmt.Println("-> Newline")
	}
	}
	}

	// Add an additional sentence ending, if the file is over but no explicit
	// sentence split was reached. This may be controversial and therefore
	// optional via parameter.
	if !dat.array[t0].isTokenEnd() {
	writer.WriteRune('\n')
	if DEBUG {
	fmt.Println("-> Newline")
	}
	}

	// TODO:
	// There may be a new line at the end, from an epsilon,
	// so we may need to go on!
	return true
	}
	*/

	// Check epsilon transitions until a final state is reached
	t0 = t
	// t = dat.array[t0].getBase() + uint32(dat.epsilon)
	t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
	a = mat.epsilon
	newchar = false
	// if dat.array[t].getCheck() == t0 {
	// t can't be < 0
	if t > 0 {
	// Remember state for backtracking to last tokenend state
	goto PARSECHARM

	} else if epsilonState != 0 {
	t0 = epsilonState
	epsilonState = 0 // reset
	buffo = epsilonOffset
	if DEBUG {
	fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
	}
	goto PARSECHARM
	}
	return false

	}