Introduce alternative matrix representation
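
The intermediate representation of the tokenizer, so far called
Tokenizer, is renamed to Automaton, and Tokenizer becomes an
interface requiring only Transduce(), so the test helpers can work
against any backend. Besides the double array representation, an
Automaton can now be turned into a MatrixTokenizer that stores the
transition table as a flat []int addressed by
(symbol-1)*stateCount+state, with negative entries marking nontoken
transitions. A sketch of the intended usage (using the test fixture):

    foma := LoadFomaFile("testdata/simpletok.fst")
    var tok Tokenizer = foma.ToMatrix() // or foma.ToDoubleArray()
    tok.Transduce(strings.NewReader("some text"), os.Stdout)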
diff --git a/datok.go b/datok.go
index b46da42..4f7ac6f 100644
--- a/datok.go
+++ b/datok.go
@@ -80,21 +80,21 @@
// into a double array representation.
//
// This is based on Mizobuchi et al (2000), p.128
-func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
+func (auto *Automaton) ToDoubleArray() *DaTokenizer {
dat := &DaTokenizer{
sigma: make(map[rune]int),
transCount: -1,
- final: tok.final,
- unknown: tok.unknown,
- identity: tok.identity,
- epsilon: tok.epsilon,
- tokenend: tok.tokenend,
+ final: auto.final,
+ unknown: auto.unknown,
+ identity: auto.identity,
+ epsilon: auto.epsilon,
+ tokenend: auto.tokenend,
}
dat.resize(dat.final)
- for num, sym := range tok.sigmaRev {
+ for num, sym := range auto.sigmaRev {
if int(sym) < 256 {
dat.sigmaASCII[int(sym)] = num
}
@@ -110,7 +110,7 @@
// Create a mapping from s (in Ms aka Intermediate FSA)
// to t (in Mt aka Double Array FSA)
- table := make([]*mapping, tok.arcCount+1)
+ table := make([]*mapping, auto.arcCount+1)
// tableQueue := make([]int, auto.arcCount+1)
// Initialize with the start state
@@ -119,7 +119,7 @@
size++
// Allocate space for the outgoing symbol range
- A := make([]int, 0, tok.sigmaCount)
+ A := make([]int, 0, auto.sigmaCount)
// tableLookup := make([]uint32, auto.arcCount+2) // make(map[int]uint32)
// tableLookup[1] = 1
@@ -135,7 +135,7 @@
// Following the paper, here the state t can be remembered
// in the set of states St
A = A[:0]
- tok.getSet(s, &A)
+ auto.getSet(s, &A)
// Set base to the first free slot in the double array
// base = dat.xCheck(A)
@@ -151,9 +151,9 @@
// Iterate over all outgoing symbols
for _, a := range A {
- if a != tok.final {
+ if a != auto.final {
- atrans = tok.transitions[s][a]
+ atrans = auto.transitions[s][a]
// Aka g(s, a)
s1 = atrans.end
diff --git a/datok_test.go b/datok_test.go
index 2dae904..67617b2 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -11,15 +11,15 @@
"github.com/stretchr/testify/assert"
)
-func tmatch(dat *DaTokenizer, s string) bool {
+func tmatch(tok Tokenizer, s string) bool {
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
- return dat.Transduce(strings.NewReader(s), w)
+ return tok.Transduce(strings.NewReader(s), w)
}
-func ttokenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string {
+func ttokenize(tok Tokenizer, w *bytes.Buffer, str string) []string {
w.Reset()
- ok := dat.Transduce(strings.NewReader(str), w)
+ ok := tok.Transduce(strings.NewReader(str), w)
if !ok {
return []string{}
}
diff --git a/fomafile.go b/fomafile.go
index 46f9d71..1d46500 100644
--- a/fomafile.go
+++ b/fomafile.go
@@ -27,13 +27,18 @@
tokenend bool
}
-// Tokenizer is the intermediate representation
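+// A Tokenizer transduces an input stream to
+// a newline-separated token stream.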
+type Tokenizer interface {
+ Transduce(r io.Reader, w io.Writer) bool
+}
+
+// Automaton is the intermediate representation
// of the tokenizer.
-type Tokenizer struct {
+type Automaton struct {
sigmaRev map[int]rune
sigmaMCS map[int]string
arcCount int
sigmaCount int
+ stateCount int
transitions []map[int]*edge
// Special symbols in sigma
@@ -47,7 +52,7 @@
// LoadFomaFile reads the FST from a foma file
// and creates an internal representation,
// provided it follows the tokenizer's convention.
-func LoadFomaFile(file string) *Tokenizer {
+func LoadFomaFile(file string) *Automaton {
f, err := os.Open(file)
if err != nil {
log.Print(err)
@@ -68,10 +73,10 @@
// ParseFoma reads the FST from a foma file reader
// and creates an internal representation,
// provided it follows the tokenizer's convention.
-func ParseFoma(ior io.Reader) *Tokenizer {
+func ParseFoma(ior io.Reader) *Automaton {
r := bufio.NewReader(ior)
- tok := &Tokenizer{
+ auto := &Automaton{
sigmaRev: make(map[int]rune),
sigmaMCS: make(map[int]string),
epsilon: -1,
@@ -111,8 +116,8 @@
// Adds a final transition symbol to sigma
// written as '#' in Mizobuchi et al (2000)
- tok.sigmaCount++
- tok.final = tok.sigmaCount
+ auto.sigmaCount++
+ auto.final = auto.sigmaCount
} else if strings.HasPrefix(line, "##sigma##") {
@@ -164,7 +169,7 @@
log.Print("Can't read arccount")
return nil
}
- tok.arcCount = elemint[0]
+ auto.arcCount = elemint[0]
elemint[0], err = strconv.Atoi(elem[2])
if err != nil {
@@ -175,7 +180,8 @@
// States start at 1 in Mizobuchi et al (2000),
// as state 0 is associated with failure.
// Initialize states and transitions
- tok.transitions = make([]map[int]*edge, elemint[0]+1)
+ auto.stateCount = elemint[0]
+ auto.transitions = make([]map[int]*edge, elemint[0]+1)
continue
}
case STATES:
@@ -241,13 +247,13 @@
if final == 1 {
// Initialize outgoing states
- if tok.transitions[state+1] == nil {
- tok.transitions[state+1] = make(map[int]*edge)
+ if auto.transitions[state+1] == nil {
+ auto.transitions[state+1] = make(map[int]*edge)
}
// TODO:
// Maybe this is less relevant for tokenizers
- tok.transitions[state+1][tok.final] = &edge{}
+ auto.transitions[state+1][auto.final] = &edge{}
}
continue
} else {
@@ -283,9 +289,9 @@
// Only a limited list of transitions are allowed
if inSym != outSym {
- if outSym == tok.tokenend && inSym == tok.epsilon {
+ if outSym == auto.tokenend && inSym == auto.epsilon {
tokenend = true
- } else if outSym == tok.epsilon {
+ } else if outSym == auto.epsilon {
nontoken = true
} else {
log.Println(
@@ -297,19 +303,19 @@
":" +
strconv.Itoa(outSym) +
") (" +
- string(tok.sigmaRev[inSym]) +
+ string(auto.sigmaRev[inSym]) +
":" +
- string(tok.sigmaRev[outSym]) +
+ string(auto.sigmaRev[outSym]) +
")")
return nil
}
- } else if inSym == tok.tokenend {
+ } else if inSym == auto.tokenend {
// Ignore tokenend accepting arcs
continue
- } else if inSym == tok.epsilon {
+ } else if inSym == auto.epsilon {
log.Println("General epsilon transitions are not supported")
return nil
- } else if tok.sigmaMCS[inSym] != "" {
+ } else if auto.sigmaMCS[inSym] != "" {
// log.Fatalln("Non supported character", auto.sigmaMCS[inSym])
// Ignore MCS transitions
continue
@@ -325,20 +331,20 @@
}
// Initialize outgoing states
- if tok.transitions[state+1] == nil {
- tok.transitions[state+1] = make(map[int]*edge)
+ if auto.transitions[state+1] == nil {
+ auto.transitions[state+1] = make(map[int]*edge)
}
// Ignore transitions with invalid symbols
if inSym >= 0 {
- tok.transitions[state+1][inSym] = targetObj
+ auto.transitions[state+1][inSym] = targetObj
}
// Add final transition
if final == 1 {
// TODO:
// Maybe this is less relevant for tokenizers
- tok.transitions[state+1][tok.final] = &edge{}
+ auto.transitions[state+1][auto.final] = &edge{}
}
if DEBUG {
@@ -349,9 +355,9 @@
":",
outSym,
") (",
- string(tok.sigmaRev[inSym]),
+ string(auto.sigmaRev[inSym]),
":",
- string(tok.sigmaRev[outSym]),
+ string(auto.sigmaRev[outSym]),
")",
";",
"TE:", tokenend,
@@ -376,7 +382,7 @@
return nil
}
- tok.sigmaCount = number
+ auto.sigmaCount = number
var symbol rune
@@ -390,26 +396,26 @@
switch elem[1] {
case "@_EPSILON_SYMBOL_@":
{
- tok.epsilon = number
+ auto.epsilon = number
}
case "@_UNKNOWN_SYMBOL_@":
{
- tok.unknown = number
+ auto.unknown = number
}
case "@_IDENTITY_SYMBOL_@":
{
- tok.identity = number
+ auto.identity = number
}
case "@_TOKEN_SYMBOL_@":
{
- tok.tokenend = number
+ auto.tokenend = number
}
default:
{
// MCS not supported
- tok.sigmaMCS[number] = line
+ auto.sigmaMCS[number] = line
}
}
continue
@@ -422,24 +428,24 @@
}
if len(line) != 1 {
// MCS not supported
- tok.sigmaMCS[number] = line
+ auto.sigmaMCS[number] = line
continue
}
symbol = rune('\n')
}
- tok.sigmaRev[number] = symbol
+ auto.sigmaRev[number] = symbol
}
}
}
- tok.sigmaMCS = nil
- return tok
+ auto.sigmaMCS = nil
+ return auto
}
// Set alphabet A to the list of all symbols
// outgoing from s
-func (tok *Tokenizer) getSet(s int, A *[]int) {
- for a := range tok.transitions[s] {
+func (auto *Automaton) getSet(s int, A *[]int) {
+ for a := range auto.transitions[s] {
*A = append(*A, a)
}
diff --git a/matrix.go b/matrix.go
new file mode 100644
index 0000000..0ec940e
--- /dev/null
+++ b/matrix.go
@@ -0,0 +1,352 @@
+package datok
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+)
+
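+// MatrixTokenizer implements the Tokenizer
+// interface based on a full transition matrix
+// representation of the automaton.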
+type MatrixTokenizer struct {
+ sigma map[rune]int
+ sigmaASCII [256]int
+ array []int
+ stateCount int
+
+ // Special symbols in sigma
+ epsilon int
+ unknown int
+ identity int
+ final int
+ tokenend int
+}
+
+// ToMatrix turns the intermediate tokenizer into a
+// matrix representation.
+func (auto *Automaton) ToMatrix() *MatrixTokenizer {
+
+ mat := &MatrixTokenizer{
+ sigma: make(map[rune]int),
+ final: auto.final,
+ unknown: auto.unknown,
+ identity: auto.identity,
+ epsilon: auto.epsilon,
+ tokenend: auto.tokenend,
+ stateCount: auto.stateCount,
+ }
+
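+ // The transition table is a flat array with one
+ // row per symbol; the cell (sym-1)*stateCount+state
+ // holds the target of the transition, with 0
+ // denoting failure.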
+ mat.array = make([]int, (auto.stateCount+1)*(auto.sigmaCount+1))
+
+ for num, sym := range auto.sigmaRev {
+ if int(sym) < 256 {
+ mat.sigmaASCII[int(sym)] = num
+ }
+ mat.sigma[sym] = num
+ if num > auto.sigmaCount {
+ panic("sigmaCount is smaller")
+ }
+ }
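+
+ // Remember states whose transitions were already stored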
+ remember := make([]bool, auto.stateCount+2)
+
+ // Store all transitions in matrix
+ var toMatrix func([]int, int)
+
+ toMatrix = func(matrix []int, start int) {
+ if start > auto.stateCount {
+ panic("stateCount is smaller")
+ }
+ if remember[start] {
+ return
+ }
+ remember[start] = true
+ for alpha, t := range auto.transitions[start] {
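+ // Store the target state, aka g(start, alpha)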
+ matrix[(alpha-1)*auto.stateCount+start] = t.end
+
+ // Mark nontoken transitions
+ if t.nontoken {
+ matrix[(alpha-1)*auto.stateCount+start] *= -1
+ }
+
+ toMatrix(matrix, t.end)
+ }
+ }
+
+ toMatrix(mat.array, 1)
+
+ return mat
+}
+
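+// Transduce an input stream to an output stream by
+// traversing the transition matrix, analogous to the
+// double array implementation.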
+func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
+ var a int
+ var t0 int
+ t := 1 // Initial state
+ var ok, rewindBuffer bool
+
+ // Remember the last position of a possible tokenend,
+ // in case the automaton fails.
+ epsilonState := 0
+ epsilonOffset := 0
+
+ buffer := make([]rune, 1024)
+ buffo := 0 // Buffer offset
+ buffi := 0 // Buffer length
+
+ reader := bufio.NewReader(r)
+ writer := bufio.NewWriter(w)
+ defer writer.Flush()
+
+ var char rune
+
+ var err error
+ eof := false
+ newchar := true
+
+PARSECHARM:
+ for {
+
+ if newchar {
+ // Get from reader if buffer is empty
+ if buffo >= buffi {
+ if eof {
+ break
+ }
+ char, _, err = reader.ReadRune()
+
+ // No more runes to read
+ if err != nil {
+ eof = true
+ break
+ }
+ buffer[buffi] = char
+ buffi++
+ }
+
+ char = buffer[buffo]
+
+ if DEBUG {
+ fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
+ }
+
+ // TODO:
+ // Better not repeatedly check for a!
+ // Possibly keep a buffer with a.
+ if int(char) < 256 {
+ a = mat.sigmaASCII[int(char)]
+ } else {
+ a, ok = mat.sigma[char]
+ if !ok {
+ a = 0
+ }
+ }
+
+ // Use identity symbol if character is not in sigma
+ if a == 0 && mat.identity != -1 {
+ a = mat.identity
+ }
+
+ t0 = t
+
+ // Check for epsilon transitions and remember
+
+ if mat.array[(mat.epsilon-1)*mat.stateCount+t0] != 0 {
+ // Remember state for backtracking to last tokenend state
+ epsilonState = t0
+ epsilonOffset = buffo
+ }
+ }
+
+ // Checks a transition based on t0, a and buffo
+ t = mat.array[(a-1)*mat.stateCount+t0]
+ // t = mat.array[t0].getBase() + uint32(a)
+ // ta := dat.array[t]
+
+ if DEBUG {
+ // Char is only relevant if set
+ fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
+ /*
+ if false {
+ fmt.Println(dat.outgoing(t0))
+ }
+ */
+ }
+
+ // Check if the transition is invalid according to the matrix
+ // if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
+ if t == 0 {
+
+ if DEBUG {
+ fmt.Println("Match is not fine!")
+ }
+
+ if !ok && a == mat.identity {
+
+ // Try again with unknown symbol, in case identity failed
+ // Char is only relevant when set
+ if DEBUG {
+ fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
+ }
+ a = mat.unknown
+
+ } else if a != mat.epsilon {
+
+ // Try again with epsilon symbol, in case everything else failed
+ t0 = epsilonState
+ epsilonState = 0 // reset
+ buffo = epsilonOffset
+ a = mat.epsilon
+
+ if DEBUG {
+ fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
+ }
+
+ } else {
+ break
+ }
+
+ newchar = false
+ continue
+ }
+
+ // Transition was successful
+ rewindBuffer = false
+
+ // Transition consumes a character
+ if a != mat.epsilon {
+
+ buffo++
+
+ // Transition does not produce a character
+ // if buffo == 1 && ta.isNonToken() {
+ if buffo == 1 && t < 0 {
+ if DEBUG {
+ fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
+ }
+ rewindBuffer = true
+ }
+
+ } else {
+ // Transition marks the end of a token - so flush the buffer
+
+ if buffi > 0 {
+ if DEBUG {
+ fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
+ }
+ writer.WriteString(string(buffer[:buffo]))
+ rewindBuffer = true
+ }
+ if DEBUG {
+ fmt.Println("-> Newline")
+ }
+ writer.WriteRune('\n')
+ }
+
+ // Rewind the buffer if necessary
+ if rewindBuffer {
+
+ // TODO: Better as a ring buffer
+ for x, i := range buffer[buffo:buffi] {
+ buffer[x] = i
+ }
+
+ buffi -= buffo
+ epsilonOffset -= buffo
+ buffo = 0
+ if DEBUG {
+ fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
+ }
+ }
+
+ // Move to representative state
+ /*
+ if ta.isSeparate() {
+ t = ta.getBase()
+ ta = dat.array[t]
+
+ if DEBUG {
+ fmt.Println("Representative pointing to", t)
+ }
+ }
+ */
+
+ // Ignore nontoken mark
+ if t < 0 {
+ t *= -1
+ }
+
+ newchar = true
+
+ // TODO:
+ // Prevent endless epsilon loops!
+ }
+
+ // Input reader is not yet finished
+ if !eof {
+ if DEBUG {
+ fmt.Println("Not at the end")
+ }
+ return false
+ }
+
+ if DEBUG {
+ fmt.Println("Entering final check")
+ }
+ /*
+ // Automaton is in a final state, so flush the buffer and return
+ x := dat.array[t].getBase() + uint32(dat.final)
+
+ if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
+
+ if buffi > 0 {
+ if DEBUG {
+ fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
+ }
+ writer.WriteString(string(buffer[:buffi]))
+
+ if dat.array[t].isTokenEnd() {
+ writer.WriteRune('\n')
+ if DEBUG {
+ fmt.Println("-> Newline")
+ }
+ }
+ }
+
+ // Add an additional sentence ending, if the file is over but no explicit
+ // sentence split was reached. This may be controversial and therefore
+ // optional via parameter.
+ if !dat.array[t0].isTokenEnd() {
+ writer.WriteRune('\n')
+ if DEBUG {
+ fmt.Println("-> Newline")
+ }
+ }
+
+ // TODO:
+ // There may be a new line at the end, from an epsilon,
+ // so we may need to go on!
+ return true
+ }
+ */
+
+ // Check epsilon transitions until a final state is reached
+ t0 = t
+ // t = dat.array[t0].getBase() + uint32(dat.epsilon)
+ t = mat.array[(mat.epsilon-1)*mat.stateCount+t0]
+ a = mat.epsilon
+ newchar = false
+ // if dat.array[t].getCheck() == t0 {
+ // t can't be < 0
+ if t > 0 {
+ // Remember state for backtracking to last tokenend state
+ goto PARSECHARM
+
+ } else if epsilonState != 0 {
+ t0 = epsilonState
+ epsilonState = 0 // reset
+ buffo = epsilonOffset
+ if DEBUG {
+ fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
+ }
+ goto PARSECHARM
+ }
+ return false
+
+}
diff --git a/matrix_test.go b/matrix_test.go
new file mode 100644
index 0000000..18ed3a2
--- /dev/null
+++ b/matrix_test.go
@@ -0,0 +1,33 @@
+package datok
+
+import (
+ "bytes"
+ "strings"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func TestFullTokenizerMatrix(t *testing.T) {
+ assert := assert.New(t)
+ foma := LoadFomaFile("testdata/simpletok.fst")
+ assert.NotNil(foma)
+
+ mat := foma.ToMatrix()
+
+ r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+ var tokens []string
+ mat.Transduce(r, w)
+ tokens = strings.Split(w.String(), "\n")
+ assert.Equal(9, len(tokens))
+ assert.Equal("wald", tokens[0])
+ assert.Equal("gehen", tokens[1])
+ assert.Equal("Da", tokens[2])
+ assert.Equal("kann", tokens[3])
+ assert.Equal("man", tokens[4])
+ assert.Equal("was", tokens[5])
+ assert.Equal("\"erleben\"", tokens[6])
+ assert.Equal("!", tokens[7])
+}